@@ -62,7 +62,6 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelParserBase::LoadFro | |||||
char *data = new (std::nothrow) char[len]; | char *data = new (std::nothrow) char[len]; | ||||
if (data == nullptr) { | if (data == nullptr) { | ||||
GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Load model From file failed, bad memory allocation occur. (need:%u)", len); | |||||
GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Load][ModelFromFile]Failed, " | GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Load][ModelFromFile]Failed, " | ||||
"bad memory allocation occur(need %u), file %s", len, model_path); | "bad memory allocation occur(need %u), file %s", len, model_path); | ||||
REPORT_CALL_ERROR("E19999", "Load model from file %s failed, " | REPORT_CALL_ERROR("E19999", "Load model from file %s failed, " | ||||
@@ -90,33 +89,45 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelParserBase::ParseMo | |||||
GE_CHECK_NOTNULL(model.model_data); | GE_CHECK_NOTNULL(model.model_data); | ||||
// Model length too small | // Model length too small | ||||
GE_CHK_BOOL_RET_STATUS(model.model_len >= sizeof(ModelFileHeader), ACL_ERROR_GE_EXEC_MODEL_DATA_SIZE_INVALID, | |||||
"Invalid model. Model data size %u must be greater than or equal to %zu.", model.model_len, | |||||
sizeof(ModelFileHeader)); | |||||
GE_CHK_BOOL_EXEC(model.model_len >= sizeof(ModelFileHeader), | |||||
REPORT_INPUT_ERROR("E10003", std::vector<std::string>({"parameter", "value", "reason"}), | |||||
std::vector<std::string>({"om", model.om_name.c_str(), "invalid om file"})); | |||||
GELOGE(ACL_ERROR_GE_EXEC_MODEL_DATA_SIZE_INVALID, | |||||
"[Check][Param] Invalid model. Model data size %u must be greater than or equal to %zu.", | |||||
model.model_len, sizeof(ModelFileHeader)); | |||||
return ACL_ERROR_GE_EXEC_MODEL_DATA_SIZE_INVALID;); | |||||
// Get file header | // Get file header | ||||
auto file_header = reinterpret_cast<ModelFileHeader *>(model.model_data); | auto file_header = reinterpret_cast<ModelFileHeader *>(model.model_data); | ||||
// Determine whether the file length and magic number match | // Determine whether the file length and magic number match | ||||
GE_CHK_BOOL_RET_STATUS( | |||||
file_header->length == model.model_len - sizeof(ModelFileHeader) && file_header->magic == MODEL_FILE_MAGIC_NUM, | |||||
ACL_ERROR_GE_EXEC_MODEL_DATA_SIZE_INVALID, | |||||
"Invalid model. file_header->length[%u] + sizeof(ModelFileHeader)[%zu] != model->model_len[%u] || " | |||||
"MODEL_FILE_MAGIC_NUM[%u] != file_header->magic[%u]", | |||||
file_header->length, sizeof(ModelFileHeader), model.model_len, MODEL_FILE_MAGIC_NUM, file_header->magic); | |||||
GE_CHK_BOOL_EXEC(file_header->length == model.model_len - sizeof(ModelFileHeader) && | |||||
file_header->magic == MODEL_FILE_MAGIC_NUM, | |||||
REPORT_INPUT_ERROR("E10003", std::vector<std::string>({"parameter", "value", "reason"}), | |||||
std::vector<std::string>({"om", model.om_name.c_str(), "invalid om file"})); | |||||
GELOGE(ACL_ERROR_GE_EXEC_MODEL_DATA_SIZE_INVALID, | |||||
"[Check][Param] Invalid model, file_header->length[%u] + sizeof(ModelFileHeader)[%zu] != " | |||||
"model->model_len[%u] || MODEL_FILE_MAGIC_NUM[%u] != file_header->magic[%u]", | |||||
file_header->length, sizeof(ModelFileHeader), model.model_len, | |||||
MODEL_FILE_MAGIC_NUM, file_header->magic); | |||||
return ACL_ERROR_GE_EXEC_MODEL_DATA_SIZE_INVALID;); | |||||
Status res = SUCCESS; | Status res = SUCCESS; | ||||
// Get data address | // Get data address | ||||
uint8_t *data = reinterpret_cast<uint8_t *>(model.model_data) + sizeof(ModelFileHeader); | uint8_t *data = reinterpret_cast<uint8_t *>(model.model_data) + sizeof(ModelFileHeader); | ||||
if (file_header->is_encrypt == ModelEncryptType::UNENCRYPTED) { // Unencrypted model | if (file_header->is_encrypt == ModelEncryptType::UNENCRYPTED) { // Unencrypted model | ||||
GE_CHK_BOOL_RET_STATUS(model.key.empty(), ACL_ERROR_GE_PARAM_INVALID, | |||||
"Invalid param. model is unencrypted, but key is not empty."); | |||||
if (!model.key.empty()) { | |||||
REPORT_INPUT_ERROR("E10003", std::vector<std::string>({"parameter", "value", "reason"}), | |||||
std::vector<std::string>({"om", model.om_name.c_str(), "invalid om file"})); | |||||
GELOGE(ACL_ERROR_GE_PARAM_INVALID, | |||||
"[Check][Param] Invalid param, model is unencrypted, but key is not empty."); | |||||
return ACL_ERROR_GE_PARAM_INVALID; | |||||
} | |||||
model_data = data; | model_data = data; | ||||
model_len = file_header->length; | model_len = file_header->length; | ||||
GELOGD("Model_len is %u, model_file_head_len is %zu.", model_len, sizeof(ModelFileHeader)); | GELOGD("Model_len is %u, model_file_head_len is %zu.", model_len, sizeof(ModelFileHeader)); | ||||
} else { | } else { | ||||
GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Check][Param]Invalid, model encrypt type not supported"); | GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Check][Param]Invalid, model encrypt type not supported"); | ||||
REPORT_CALL_ERROR("E19999","Invalid model, encrypt type not supported"); | |||||
REPORT_INPUT_ERROR("E10003", std::vector<std::string>({"parameter", "value", "reason"}), | |||||
std::vector<std::string>({"om", model.om_name.c_str(), "invalid om file"})); | |||||
res = ACL_ERROR_GE_PARAM_INVALID; | res = ACL_ERROR_GE_PARAM_INVALID; | ||||
} | } | ||||
@@ -33,12 +33,12 @@ Status GraphLoader::UnloadModel(uint32_t model_id) { | |||||
Status ret = model_manager->Stop(model_id); | Status ret = model_manager->Stop(model_id); | ||||
if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
GELOGE(ret, "UnloadModel: Stop failed. model id:%u", model_id); | |||||
GELOGE(ret, "[Stop][Model] failed. model id:%u", model_id); | |||||
} | } | ||||
ret = model_manager->Unload(model_id); | ret = model_manager->Unload(model_id); | ||||
if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
GELOGE(ret, "UnloadModel: Unload failed. model id:%u", model_id); | |||||
GELOGE(ret, "[Unload][Model] failed. model id:%u", model_id); | |||||
return ret; | return ret; | ||||
} | } | ||||
GELOGI("UnLoad model success, model id:%u.", model_id); | GELOGI("UnLoad model success, model id:%u.", model_id); | ||||
@@ -50,14 +50,13 @@ Status GraphLoader::LoadModelOnline(uint32_t &model_id, const std::shared_ptr<ge | |||||
GELOGI("Load model online begin."); | GELOGI("Load model online begin."); | ||||
rtError_t rt_ret = rtSetDevice(GetContext().DeviceId()); | rtError_t rt_ret = rtSetDevice(GetContext().DeviceId()); | ||||
if (rt_ret != RT_ERROR_NONE) { | if (rt_ret != RT_ERROR_NONE) { | ||||
REPORT_CALL_ERROR("E19999", "Call rtSetDevice failed, device_id:%u, ret:0x%X", | |||||
GetContext().DeviceId(), rt_ret); | |||||
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); | |||||
REPORT_CALL_ERROR("E19999", "Call rtSetDevice failed, device_id:%u, ret:0x%X", GetContext().DeviceId(), rt_ret); | |||||
GELOGE(RT_FAILED, "[Call][RtSetDevice] failed, device_id:%u, ret:0x%X", GetContext().DeviceId(), rt_ret); | |||||
return RT_FAILED; | return RT_FAILED; | ||||
} | } | ||||
if (ge_root_model_ptr == nullptr) { | if (ge_root_model_ptr == nullptr) { | ||||
REPORT_INNER_ERROR("E19999", "Check param ge_root_model_ptr nullptr, check invalid"); | REPORT_INNER_ERROR("E19999", "Check param ge_root_model_ptr nullptr, check invalid"); | ||||
GELOGE(GE_GRAPH_PARAM_NULLPTR, "[LoadGraph] GE load graph model_ptr is nullptr."); | |||||
GELOGE(GE_GRAPH_PARAM_NULLPTR, "[LoadGraph][Check][Param] GE load graph model_ptr is nullptr."); | |||||
return GE_GRAPH_PARAM_NULLPTR; | return GE_GRAPH_PARAM_NULLPTR; | ||||
} | } | ||||
@@ -65,12 +64,12 @@ Status GraphLoader::LoadModelOnline(uint32_t &model_id, const std::shared_ptr<ge | |||||
GE_CHECK_NOTNULL(model_manager); | GE_CHECK_NOTNULL(model_manager); | ||||
Status ret = model_manager->LoadModelOnline(model_id, ge_root_model_ptr, listener); | Status ret = model_manager->LoadModelOnline(model_id, ge_root_model_ptr, listener); | ||||
if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
GELOGE(ret, "LoadModel: Load failed. ret = %u", ret); | |||||
GELOGE(ret, "[Load][Model] Online failed. ret = %u, model_id:%u", ret, model_id); | |||||
rt_ret = rtDeviceReset(GetContext().DeviceId()); | rt_ret = rtDeviceReset(GetContext().DeviceId()); | ||||
if (rt_ret != RT_ERROR_NONE) { | if (rt_ret != RT_ERROR_NONE) { | ||||
REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, ret:0x%X", | REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, ret:0x%X", | ||||
GetContext().DeviceId(), rt_ret); | GetContext().DeviceId(), rt_ret); | ||||
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); | |||||
GELOGE(RT_FAILED, "[Call][RtDeviceReset] failed, device_id:%u, ret:0x%X", GetContext().DeviceId(), rt_ret); | |||||
} | } | ||||
return ret; | return ret; | ||||
} | } | ||||
@@ -81,31 +80,31 @@ Status GraphLoader::LoadModelOnline(uint32_t &model_id, const std::shared_ptr<ge | |||||
if (rt_ret != RT_ERROR_NONE) { | if (rt_ret != RT_ERROR_NONE) { | ||||
REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, ret:0x%X", | REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, ret:0x%X", | ||||
GetContext().DeviceId(), rt_ret); | GetContext().DeviceId(), rt_ret); | ||||
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); | |||||
GELOGE(RT_FAILED, "[Call][RtDeviceReset] failed, device_id:%u, ret:0x%X", GetContext().DeviceId(), rt_ret); | |||||
} | } | ||||
return SUCCESS; | return SUCCESS; | ||||
} | } | ||||
ret = model_manager->Start(model_id); | ret = model_manager->Start(model_id); | ||||
if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
if (model_manager->Unload(model_id) != SUCCESS) { | if (model_manager->Unload(model_id) != SUCCESS) { | ||||
GELOGE(ret, "LoadModel: Unload failed while trying to unload after a failed start."); | |||||
GELOGE(ret, "[Unload][Model] failed while trying to unload after a failed start, model_id:%u.", model_id); | |||||
} | } | ||||
rt_ret = rtDeviceReset(GetContext().DeviceId()); | rt_ret = rtDeviceReset(GetContext().DeviceId()); | ||||
if (rt_ret != RT_ERROR_NONE) { | if (rt_ret != RT_ERROR_NONE) { | ||||
REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, ret:0x%X", | REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, ret:0x%X", | ||||
GetContext().DeviceId(), rt_ret); | GetContext().DeviceId(), rt_ret); | ||||
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); | |||||
GELOGE(RT_FAILED, "[Call][RtDeviceReset] failed, device_id:%u, ret:0x%X", GetContext().DeviceId(), rt_ret); | |||||
} | } | ||||
GELOGE(ret, "LoadModel: Start failed."); | |||||
GELOGE(ret, "[Start][Model] failed, model_id:%u.", model_id); | |||||
return ret; | return ret; | ||||
} | } | ||||
rt_ret = rtDeviceReset(GetContext().DeviceId()); | rt_ret = rtDeviceReset(GetContext().DeviceId()); | ||||
if (rt_ret != RT_ERROR_NONE) { | if (rt_ret != RT_ERROR_NONE) { | ||||
REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, ret:0x%X", | REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, ret:0x%X", | ||||
GetContext().DeviceId(), rt_ret); | GetContext().DeviceId(), rt_ret); | ||||
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); | |||||
GELOGE(RT_FAILED, "[Call][RtDeviceReset] failed, device_id:%u, ret:0x%X", GetContext().DeviceId(), rt_ret); | |||||
return RT_FAILED; | return RT_FAILED; | ||||
} | } | ||||
GELOGI("Load model online success, model_id:%u.", model_id); | GELOGI("Load model online success, model_id:%u.", model_id); | ||||
@@ -118,7 +117,7 @@ Status GraphLoader::GetMaxUsedMemory(uint32_t model_id, uint64_t &max_size) { | |||||
GE_CHECK_NOTNULL(model_manager); | GE_CHECK_NOTNULL(model_manager); | ||||
Status ret = model_manager->GetMaxUsedMemory(model_id, max_size); | Status ret = model_manager->GetMaxUsedMemory(model_id, max_size); | ||||
if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
GELOGE(ret, "GetMaxUsedMemory: GetMaxUsedMemory failed."); | |||||
GELOGE(ret, "[Call][GetMaxUsedMemory] failed, model_id:%u.", model_id); | |||||
return ret; | return ret; | ||||
} | } | ||||
return SUCCESS; | return SUCCESS; | ||||
@@ -127,21 +126,20 @@ Status GraphLoader::GetMaxUsedMemory(uint32_t model_id, uint64_t &max_size) { | |||||
Status GraphLoader::LoadDataFromFile(const std::string &path, const std::string &key_path, int32_t priority, | Status GraphLoader::LoadDataFromFile(const std::string &path, const std::string &key_path, int32_t priority, | ||||
ModelData &model_data) { | ModelData &model_data) { | ||||
if (!CheckInputPathValid(path)) { | if (!CheckInputPathValid(path)) { | ||||
GELOGE(ACL_ERROR_GE_EXEC_MODEL_PATH_INVALID, "model path is invalid: %s", path.c_str()); | |||||
GELOGE(ACL_ERROR_GE_EXEC_MODEL_PATH_INVALID, "[Check][Param] model path is invalid:%s", path.c_str()); | |||||
return ACL_ERROR_GE_EXEC_MODEL_PATH_INVALID; | return ACL_ERROR_GE_EXEC_MODEL_PATH_INVALID; | ||||
} | } | ||||
GELOGI("Load model begin, model path is: %s", path.c_str()); | GELOGI("Load model begin, model path is: %s", path.c_str()); | ||||
if (!key_path.empty() && !CheckInputPathValid(key_path)) { | if (!key_path.empty() && !CheckInputPathValid(key_path)) { | ||||
REPORT_INNER_ERROR("E19999", "Param key_path:%s empty or invalid", | |||||
key_path.c_str()); | |||||
GELOGE(ACL_ERROR_GE_PARAM_INVALID, "decrypt_key path is invalid: %s", key_path.c_str()); | |||||
REPORT_INNER_ERROR("E19999", "Param key_path:%s empty or invalid", key_path.c_str()); | |||||
GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Check][Param] decrypt_key path is invalid:%s", key_path.c_str()); | |||||
return ACL_ERROR_GE_PARAM_INVALID; | return ACL_ERROR_GE_PARAM_INVALID; | ||||
} | } | ||||
Status ret = ModelParserBase::LoadFromFile(path.c_str(), key_path.c_str(), priority, model_data); | Status ret = ModelParserBase::LoadFromFile(path.c_str(), key_path.c_str(), priority, model_data); | ||||
if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
GELOGE(ret, "LoadModelFromFile: Load failed. ret = %u", ret); | |||||
GELOGE(ret, "[Call][LoadFromFile] failed. ret = %u, path:%s, key path:%s", ret, path.c_str(), key_path.c_str()); | |||||
if (model_data.model_data != nullptr) { | if (model_data.model_data != nullptr) { | ||||
delete[] static_cast<char *>(model_data.model_data); | delete[] static_cast<char *>(model_data.model_data); | ||||
model_data.model_data = nullptr; | model_data.model_data = nullptr; | ||||
@@ -156,18 +154,19 @@ Status GraphLoader::CommandHandle(const Command &command) { | |||||
GE_CHECK_NOTNULL(model_manager); | GE_CHECK_NOTNULL(model_manager); | ||||
Status ret = model_manager->HandleCommand(command); | Status ret = model_manager->HandleCommand(command); | ||||
if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
GELOGE(ret, "CommandHandle: Command Handle failed."); | |||||
GELOGE(ret, "[Handle][Command] failed, module_index:%lu.", command.module_index); | |||||
return ret; | return ret; | ||||
} | } | ||||
} catch (std::bad_alloc &) { | } catch (std::bad_alloc &) { | ||||
REPORT_INNER_ERROR("E19999", "Bad memory allocation occur"); | REPORT_INNER_ERROR("E19999", "Bad memory allocation occur"); | ||||
GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Command handle failed, bad memory allocation occur !"); | |||||
GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Handle][Command] failed, " | |||||
"bad memory allocation occur, module_index:%lu.", command.module_index); | |||||
return ACL_ERROR_GE_MEMORY_ALLOCATION; | return ACL_ERROR_GE_MEMORY_ALLOCATION; | ||||
} catch (...) { | } catch (...) { | ||||
REPORT_INNER_ERROR("E19999", "Some exceptions occur"); | REPORT_INNER_ERROR("E19999", "Some exceptions occur"); | ||||
GELOGE(FAILED, "Command handle failed, some exceptions occur !"); | |||||
GELOGE(FAILED, "[Handle][Command] failed, some exceptions occur, module_index:%lu.", command.module_index); | |||||
return FAILED; | return FAILED; | ||||
} | } | ||||
@@ -184,7 +183,7 @@ Status GraphLoader::LoadModelFromData(uint32_t &model_id, const ModelData &model | |||||
Status ret = model_manager->LoadModelOffline( | Status ret = model_manager->LoadModelOffline( | ||||
model_id, model_data, nullptr, dev_ptr, mem_size, weight_ptr, weight_size); | model_id, model_data, nullptr, dev_ptr, mem_size, weight_ptr, weight_size); | ||||
if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
GELOGE(ret, "Load model failed, model_id:%u.", model_id); | |||||
GELOGE(ret, "[Load][Model] failed, model_id:%u.", model_id); | |||||
return ret; | return ret; | ||||
} | } | ||||
GELOGI("Load model success, model_id:%u.", model_id); | GELOGI("Load model success, model_id:%u.", model_id); | ||||
@@ -210,7 +209,7 @@ Status GraphLoader::LoadModelWithQ(uint32_t &model_id, const ModelData &model_da | |||||
GE_CHECK_NOTNULL(model_manager); | GE_CHECK_NOTNULL(model_manager); | ||||
Status ret = model_manager->LoadModelWithQ(model_id, model_data, input_queue_ids, output_queue_ids); | Status ret = model_manager->LoadModelWithQ(model_id, model_data, input_queue_ids, output_queue_ids); | ||||
if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
GELOGE(ret, "Load model with queue failed, model_id:%u.", model_id); | |||||
GELOGE(ret, "[Load][Model] with queue failed, model_id:%u.", model_id); | |||||
return ret; | return ret; | ||||
} | } | ||||
@@ -237,7 +236,7 @@ Status GraphLoader::ExecuteModel(uint32_t model_id, rtStream_t stream, bool asyn | |||||
Status ret = model_manager->ExecuteModel(model_id, stream, async_mode, | Status ret = model_manager->ExecuteModel(model_id, stream, async_mode, | ||||
input_data, input_desc, output_data, output_desc); | input_data, input_desc, output_data, output_desc); | ||||
if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
GELOGE(ret, "Execute model failed, model_id:%u.", model_id); | |||||
GELOGE(ret, "[Execute][Model] failed, model_id:%u.", model_id); | |||||
return ret; | return ret; | ||||
} | } | ||||
@@ -250,7 +249,7 @@ Status GraphLoader::GetMemoryInfo(int64_t &free) { | |||||
if (rt_ret != RT_ERROR_NONE) { | if (rt_ret != RT_ERROR_NONE) { | ||||
REPORT_CALL_ERROR("E19999", "Call rtSetDevice failed, device_id:%u, ret:0x%X", | REPORT_CALL_ERROR("E19999", "Call rtSetDevice failed, device_id:%u, ret:0x%X", | ||||
GetContext().DeviceId(), rt_ret); | GetContext().DeviceId(), rt_ret); | ||||
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); | |||||
GELOGE(RT_FAILED, "[Call][RtSetDevice] failed, device_id:%u, ret:0x%X", GetContext().DeviceId(), rt_ret); | |||||
return RT_FAILED; | return RT_FAILED; | ||||
} | } | ||||
size_t total_mem = 0; | size_t total_mem = 0; | ||||
@@ -258,14 +257,14 @@ Status GraphLoader::GetMemoryInfo(int64_t &free) { | |||||
rt_ret = rtMemGetInfo(&free_mem, &total_mem); | rt_ret = rtMemGetInfo(&free_mem, &total_mem); | ||||
if (rt_ret != RT_ERROR_NONE) { | if (rt_ret != RT_ERROR_NONE) { | ||||
REPORT_CALL_ERROR("E19999", "Call rtMemGetInfo failed, ret:0x%X", rt_ret); | REPORT_CALL_ERROR("E19999", "Call rtMemGetInfo failed, ret:0x%X", rt_ret); | ||||
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); | |||||
GELOGE(RT_FAILED, "[Call][RtMemGetInfo] failed, ret:0x%X", rt_ret); | |||||
return RT_FAILED; | return RT_FAILED; | ||||
} | } | ||||
rt_ret = rtDeviceReset(GetContext().DeviceId()); | rt_ret = rtDeviceReset(GetContext().DeviceId()); | ||||
if (rt_ret != RT_ERROR_NONE) { | if (rt_ret != RT_ERROR_NONE) { | ||||
REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, ret:0x%X", | REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, ret:0x%X", | ||||
GetContext().DeviceId(), rt_ret); | GetContext().DeviceId(), rt_ret); | ||||
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); | |||||
GELOGE(RT_FAILED, "[Call][RtDeviceReset] failed, device_id:%u, ret:0x%X", GetContext().DeviceId(), rt_ret); | |||||
return RT_FAILED; | return RT_FAILED; | ||||
} | } | ||||
// Add small page memory size | // Add small page memory size | ||||
@@ -280,7 +279,8 @@ Status GraphLoader::DestroyAicpuKernel(uint64_t session_id, uint32_t model_id, u | |||||
GE_CHECK_NOTNULL(model_manager); | GE_CHECK_NOTNULL(model_manager); | ||||
Status ret = model_manager->DestroyAicpuKernel(session_id, model_id, sub_model_id); | Status ret = model_manager->DestroyAicpuKernel(session_id, model_id, sub_model_id); | ||||
if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
GELOGE(ret, "Destroy aicpu kernel failed."); | |||||
GELOGE(ret, "[Destroy][AicpuKernel] failed, session_id:%lu, model_id:%u, sub_model_id:%u.", | |||||
session_id, model_id, sub_model_id); | |||||
return ret; | return ret; | ||||
} | } | ||||
return SUCCESS; | return SUCCESS; | ||||
@@ -291,7 +291,7 @@ Status GraphLoader::DestroyAicpuSessionForInfer(uint32_t model_id) { | |||||
GE_CHECK_NOTNULL(model_manager); | GE_CHECK_NOTNULL(model_manager); | ||||
Status ret = model_manager->DestroyAicpuSessionForInfer(model_id); | Status ret = model_manager->DestroyAicpuSessionForInfer(model_id); | ||||
if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
GELOGE(ret, "Destroy aicpu serrion for infer failed."); | |||||
GELOGE(ret, "[Call][DestroyAicpuSessionForInfer] failed, model_id:%u.", model_id); | |||||
return ret; | return ret; | ||||
} | } | ||||
return SUCCESS; | return SUCCESS; | ||||
@@ -310,7 +310,7 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager { | |||||
std::lock_guard<std::mutex> lock(exeception_infos_mutex_); | std::lock_guard<std::mutex> lock(exeception_infos_mutex_); | ||||
auto instance = ModelManager::GetInstance(); | auto instance = ModelManager::GetInstance(); | ||||
if (instance == nullptr) { | if (instance == nullptr) { | ||||
GELOGE(FAILED, "Instance is nullptr"); | |||||
GELOGE(FAILED, "[Get][Instance] failed, as ret is nullptr"); | |||||
return; | return; | ||||
} | } | ||||
instance->AddExceptionInfo(*rt_exception_info); | instance->AddExceptionInfo(*rt_exception_info); | ||||
@@ -26,10 +26,10 @@ | |||||
#define VALIDATE_MEM_RANGE(OP, SIZE, OFFSET) \ | #define VALIDATE_MEM_RANGE(OP, SIZE, OFFSET) \ | ||||
do { \ | do { \ | ||||
if (SIZE <= static_cast<uint64_t>(OFFSET)) { \ | if (SIZE <= static_cast<uint64_t>(OFFSET)) { \ | ||||
REPORT_INNER_ERROR("E19999", \ | |||||
"Node:%s(%s) offset:%ld out of range size:%lu, check invalid", \ | |||||
REPORT_INNER_ERROR("E19999", "Node:%s(%s) offset:%ld out of range size:%lu, check invalid", \ | |||||
OP->GetName().c_str(), OP->GetType().c_str(), OFFSET, SIZE); \ | OP->GetName().c_str(), OP->GetType().c_str(), OFFSET, SIZE); \ | ||||
GELOGE(OUT_OF_MEMORY, "Node: %s, memory out of range[%lu: %ld]", OP->GetName().c_str(), SIZE, OFFSET); \ | |||||
GELOGE(OUT_OF_MEMORY, "[Check][Param]Node: %s, memory out of range[%lu: %ld]", \ | |||||
OP->GetName().c_str(), SIZE, OFFSET); \ | |||||
return {}; \ | return {}; \ | ||||
} \ | } \ | ||||
} while (0) | } while (0) | ||||
@@ -312,8 +312,9 @@ vector<void *> ModelUtils::GetInputDataAddrs(const RuntimeParam &model_param, Co | |||||
REPORT_INNER_ERROR("E19999", "Attr:%s, memory_type.size:%zu != input_desc.size:%zu, op:%s(%s), check invalid", | REPORT_INNER_ERROR("E19999", "Attr:%s, memory_type.size:%zu != input_desc.size:%zu, op:%s(%s), check invalid", | ||||
ATTR_NAME_INPUT_MEM_TYPE_LIST.c_str(), v_memory_type.size(), inputs_size, | ATTR_NAME_INPUT_MEM_TYPE_LIST.c_str(), v_memory_type.size(), inputs_size, | ||||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | op_desc->GetName().c_str(), op_desc->GetType().c_str()); | ||||
GELOGE(PARAM_INVALID, "Fusion: check input size failed, op: %s, input v_memory_type size: %zu input numbers: %zu", | |||||
op_desc->GetName().c_str(), v_memory_type.size(), inputs_size); | |||||
GELOGE(PARAM_INVALID, "[Check][Param] Attr:%s, memory_type.size:%zu != input_desc.size:%zu, op:%s(%s)", | |||||
ATTR_NAME_INPUT_MEM_TYPE_LIST.c_str(), v_memory_type.size(), inputs_size, | |||||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||||
return v_input_data_addr; | return v_input_data_addr; | ||||
} | } | ||||
for (size_t i = 0; i < op_desc->GetAllInputsSize(); ++i) { | for (size_t i = 0; i < op_desc->GetAllInputsSize(); ++i) { | ||||
@@ -392,8 +393,7 @@ Status ModelUtils::GetVarAddr(const RuntimeParam &model_param, const ConstOpDesc | |||||
case RT_MEMORY_RDMA_HBM: | case RT_MEMORY_RDMA_HBM: | ||||
if (offset < 0) { | if (offset < 0) { | ||||
REPORT_INNER_ERROR("E19999", "Param offset:%ld < 0, check invalid", offset); | REPORT_INNER_ERROR("E19999", "Param offset:%ld < 0, check invalid", offset); | ||||
GELOGE(PARAM_INVALID, "rdma var addr is invalid, addr=%p", | |||||
reinterpret_cast<uint8_t *>(static_cast<uintptr_t>(offset))); | |||||
GELOGE(PARAM_INVALID, "[Check][Param] Param offset:%ld cannot be negative", offset); | |||||
return PARAM_INVALID; | return PARAM_INVALID; | ||||
} | } | ||||
var_addr = reinterpret_cast<uint8_t *>(static_cast<uintptr_t>(offset)); | var_addr = reinterpret_cast<uint8_t *>(static_cast<uintptr_t>(offset)); | ||||
@@ -403,9 +403,9 @@ Status ModelUtils::GetVarAddr(const RuntimeParam &model_param, const ConstOpDesc | |||||
var_addr = model_param.var_base + offset - model_param.logic_var_base; | var_addr = model_param.var_base + offset - model_param.logic_var_base; | ||||
break; | break; | ||||
default: | default: | ||||
REPORT_INNER_ERROR("E19999", "Get mem_type:%d for offset:%ld is unsupported, check invalid", | |||||
mem_type, offset); | |||||
GELOGE(PARAM_INVALID, "unsupported memory type %u", mem_type); | |||||
REPORT_INNER_ERROR("E19999", "Get mem_type:%d for offset:%ld is unsupported, check invalid", mem_type, offset); | |||||
GELOGE(PARAM_INVALID, "[Check][Param] Get mem_type:%d for offset:%ld is unsupported, check invalid", | |||||
mem_type, offset); | |||||
return PARAM_INVALID; | return PARAM_INVALID; | ||||
} | } | ||||
GE_CHECK_NOTNULL(var_addr); | GE_CHECK_NOTNULL(var_addr); | ||||
@@ -433,9 +433,9 @@ vector<void *> ModelUtils::GetOutputDataAddrs(const RuntimeParam &model_param, C | |||||
REPORT_INNER_ERROR("E19999", "Attr:%s, memory_type.size:%zu != output_desc.size:%zu, op:%s(%s), check invalid", | REPORT_INNER_ERROR("E19999", "Attr:%s, memory_type.size:%zu != output_desc.size:%zu, op:%s(%s), check invalid", | ||||
ATTR_NAME_OUTPUT_MEM_TYPE_LIST.c_str(), v_memory_type.size(), outputs_size, | ATTR_NAME_OUTPUT_MEM_TYPE_LIST.c_str(), v_memory_type.size(), outputs_size, | ||||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | op_desc->GetName().c_str(), op_desc->GetType().c_str()); | ||||
GELOGE(PARAM_INVALID, | |||||
"Fusion: check output size failed, op: %s, output v_memory_type size: %lu output numbers: %zu", | |||||
op_desc->GetName().c_str(), v_memory_type.size(), outputs_size); | |||||
GELOGE(PARAM_INVALID, "[Check][Param] Attr:%s, memory_type.size:%zu != output_desc.size:%zu, op:%s(%s)", | |||||
ATTR_NAME_OUTPUT_MEM_TYPE_LIST.c_str(), v_memory_type.size(), outputs_size, | |||||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||||
return v_output_data_addr; | return v_output_data_addr; | ||||
} | } | ||||
for (size_t i = 0; i < outputs_size; ++i) { | for (size_t i = 0; i < outputs_size; ++i) { | ||||
@@ -594,7 +594,7 @@ Status ModelUtils::GetRtAddress(const RuntimeParam ¶m, uintptr_t logic_addr, | |||||
} else if (logic_addr != 0) { | } else if (logic_addr != 0) { | ||||
mem_addr = nullptr; | mem_addr = nullptr; | ||||
REPORT_INNER_ERROR("E19999", "Check param logic addr:0x%lx abnormal", logic_addr); | REPORT_INNER_ERROR("E19999", "Check param logic addr:0x%lx abnormal", logic_addr); | ||||
GELOGE(PARAM_INVALID, "The logic addr:0x%lx is abnormal", logic_addr); | |||||
GELOGE(PARAM_INVALID, "[Check][Param] The logic addr:0x%lx is abnormal", logic_addr); | |||||
return PARAM_INVALID; | return PARAM_INVALID; | ||||
} | } | ||||
@@ -24,7 +24,7 @@ namespace ge { | |||||
void TbeHandleInfo::used_inc(uint32_t num) { | void TbeHandleInfo::used_inc(uint32_t num) { | ||||
if (used_ > std::numeric_limits<uint32_t>::max() - num) { | if (used_ > std::numeric_limits<uint32_t>::max() - num) { | ||||
REPORT_INNER_ERROR("E19999", "Used:%u reach numeric max", used_); | REPORT_INNER_ERROR("E19999", "Used:%u reach numeric max", used_); | ||||
GELOGE(INTERNAL_ERROR, "Used[%u] reach numeric max.", used_); | |||||
GELOGE(INTERNAL_ERROR, "[Check][Param] Used[%u] reach numeric max.", used_); | |||||
return; | return; | ||||
} | } | ||||
@@ -34,7 +34,7 @@ void TbeHandleInfo::used_inc(uint32_t num) { | |||||
void TbeHandleInfo::used_dec(uint32_t num) { | void TbeHandleInfo::used_dec(uint32_t num) { | ||||
if (used_ < std::numeric_limits<uint32_t>::min() + num) { | if (used_ < std::numeric_limits<uint32_t>::min() + num) { | ||||
REPORT_INNER_ERROR("E19999", "Used:%u reach numeric min", used_); | REPORT_INNER_ERROR("E19999", "Used:%u reach numeric min", used_); | ||||
GELOGE(INTERNAL_ERROR, "Used[%u] reach numeric min.", used_); | |||||
GELOGE(INTERNAL_ERROR, "[Check][Param] Used[%u] reach numeric min.", used_); | |||||
return; | return; | ||||
} | } | ||||
@@ -107,9 +107,8 @@ void TBEHandleStore::ReferTBEHandle(const std::string &name) { | |||||
std::lock_guard<std::mutex> lock(mutex_); | std::lock_guard<std::mutex> lock(mutex_); | ||||
auto it = kernels_.find(name); | auto it = kernels_.find(name); | ||||
if (it == kernels_.end()) { | if (it == kernels_.end()) { | ||||
REPORT_INNER_ERROR("E19999", "Kernel:%s not found in stored check invalid", | |||||
name.c_str()); | |||||
GELOGE(INTERNAL_ERROR, "Kernel[%s] not found in stored.", name.c_str()); | |||||
REPORT_INNER_ERROR("E19999", "Kernel:%s not found in stored check invalid", name.c_str()); | |||||
GELOGE(INTERNAL_ERROR, "[Check][Param] Kernel[%s] not found in stored.", name.c_str()); | |||||
return; | return; | ||||
} | } | ||||
@@ -128,9 +127,8 @@ void TBEHandleStore::EraseTBEHandle(const std::map<std::string, uint32_t> &names | |||||
for (auto &item : names) { | for (auto &item : names) { | ||||
auto it = kernels_.find(item.first); | auto it = kernels_.find(item.first); | ||||
if (it == kernels_.end()) { | if (it == kernels_.end()) { | ||||
REPORT_INNER_ERROR("E19999", "Kernel:%s not found in stored check invalid", | |||||
item.first.c_str()); | |||||
GELOGE(INTERNAL_ERROR, "Kernel[%s] not found in stored.", item.first.c_str()); | |||||
REPORT_INNER_ERROR("E19999", "Kernel:%s not found in stored check invalid", item.first.c_str()); | |||||
GELOGE(INTERNAL_ERROR, "[Check][Param] Kernel[%s] not found in stored.", item.first.c_str()); | |||||
continue; | continue; | ||||
} | } | ||||
@@ -142,7 +140,8 @@ void TBEHandleStore::EraseTBEHandle(const std::map<std::string, uint32_t> &names | |||||
if (rt_ret != RT_ERROR_NONE) { | if (rt_ret != RT_ERROR_NONE) { | ||||
REPORT_INNER_ERROR("E19999", "Call rtDevBinaryUnRegister failed for Kernel:%s fail, ret:0x%X", | REPORT_INNER_ERROR("E19999", "Call rtDevBinaryUnRegister failed for Kernel:%s fail, ret:0x%X", | ||||
item.first.c_str(), rt_ret); | item.first.c_str(), rt_ret); | ||||
GELOGE(INTERNAL_ERROR, "Kernel[%s] UnRegister handle fail:%u.", item.first.c_str(), rt_ret); | |||||
GELOGE(INTERNAL_ERROR, "[Call][RtDevBinaryUnRegister] Kernel[%s] UnRegister handle fail:%u.", | |||||
item.first.c_str(), rt_ret); | |||||
} | } | ||||
kernels_.erase(it); | kernels_.erase(it); | ||||
} | } | ||||
@@ -43,7 +43,7 @@ class TsMemMall { | |||||
for (auto it : mem_store_size_) { | for (auto it : mem_store_size_) { | ||||
rtError_t ret = rtFree(it.second); | rtError_t ret = rtFree(it.second); | ||||
if (ret != RT_ERROR_NONE) { | if (ret != RT_ERROR_NONE) { | ||||
GELOGE(RT_FAILED, "Call rtFree failed, ret: 0x%X", ret); | |||||
GELOGE(RT_FAILED, "[Call][RtFree] failed, ret:0x%X", ret); | |||||
} | } | ||||
} | } | ||||
mem_store_size_.clear(); | mem_store_size_.clear(); | ||||
@@ -52,7 +52,7 @@ class TsMemMall { | |||||
void *Acquire(int64_t offset, uint64_t size) { | void *Acquire(int64_t offset, uint64_t size) { | ||||
if (size == 0) { | if (size == 0) { | ||||
GELOGE(RT_FAILED, "Acquire mem block failed, size: %lu", size); | |||||
GELOGE(RT_FAILED, "[Check][Param] Acquire mem block failed, size:%lu", size); | |||||
return nullptr; | return nullptr; | ||||
} | } | ||||
@@ -71,7 +71,7 @@ class TsMemMall { | |||||
void *addr = nullptr; | void *addr = nullptr; | ||||
rtError_t rt_ret = rtMalloc(&addr, bytes, mem_type_); | rtError_t rt_ret = rtMalloc(&addr, bytes, mem_type_); | ||||
if (rt_ret != RT_ERROR_NONE) { | if (rt_ret != RT_ERROR_NONE) { | ||||
GELOGE(RT_FAILED, "Call rtMalloc failed, ret: 0x%X", rt_ret); | |||||
GELOGE(RT_FAILED, "[Call][RtMalloc] failed, size:%lu, ret:0x%X", bytes, rt_ret); | |||||
return nullptr; | return nullptr; | ||||
} | } | ||||
@@ -94,7 +94,7 @@ class TsMemMall { | |||||
mem_store_addr_.erase(it); | mem_store_addr_.erase(it); | ||||
rtError_t ret = rtFree(addr); | rtError_t ret = rtFree(addr); | ||||
if (ret != RT_ERROR_NONE) { | if (ret != RT_ERROR_NONE) { | ||||
GELOGE(RT_FAILED, "Call rtFree failed, ret: 0x%X", ret); | |||||
GELOGE(RT_FAILED, "[Call][RtFree] failed, ret:0x%X", ret); | |||||
} | } | ||||
} | } | ||||
@@ -38,8 +38,13 @@ Status ZeroCopyOffset::InitInputDataInfo(int64_t output_size, void *virtual_addr | |||||
op_name_ = op_desc->GetName(); | op_name_ = op_desc->GetName(); | ||||
(void)ge::AttrUtils::GetListInt(op_desc, ATTR_ZERO_COPY_BASIC_OFFSET, zero_copy_basic_offset_); | (void)ge::AttrUtils::GetListInt(op_desc, ATTR_ZERO_COPY_BASIC_OFFSET, zero_copy_basic_offset_); | ||||
(void)ge::AttrUtils::GetListInt(op_desc, ATTR_ZERO_COPY_RELATIVE_OFFSET, zero_copy_relative_offset_); | (void)ge::AttrUtils::GetListInt(op_desc, ATTR_ZERO_COPY_RELATIVE_OFFSET, zero_copy_relative_offset_); | ||||
GE_CHK_BOOL_EXEC(zero_copy_basic_offset_.size() == zero_copy_relative_offset_.size(), return PARAM_INVALID, | |||||
"basic_offset_size should be equal to relative_offset_size"); | |||||
GE_CHK_BOOL_EXEC(zero_copy_basic_offset_.size() == zero_copy_relative_offset_.size(), | |||||
REPORT_INNER_ERROR("E19999", "basic_offset_size:%zu not equal to relative_offset_size:%zu, " | |||||
"check invalid", zero_copy_basic_offset_.size(), | |||||
zero_copy_relative_offset_.size()); | |||||
return PARAM_INVALID, | |||||
"[Check][Param] basic_offset_size:%zu should be equal to relative_offset_size:%zu", | |||||
zero_copy_basic_offset_.size(), zero_copy_relative_offset_.size()); | |||||
GELOGD("[ZCPY] zero_copy_basic_offset size is %zu", zero_copy_basic_offset_.size()); | GELOGD("[ZCPY] zero_copy_basic_offset size is %zu", zero_copy_basic_offset_.size()); | ||||
int64_t virtual_addr_offset = op_desc->GetOutputOffset().at(kDataIndex); | int64_t virtual_addr_offset = op_desc->GetOutputOffset().at(kDataIndex); | ||||
@@ -78,7 +83,8 @@ Status ZeroCopyOffset::InitOutputDataInfo(const vector<int64_t> &input_size_list | |||||
if (TensorUtils::GetTensorSizeInBytes(*tensor_desc, size) != GRAPH_SUCCESS) { | if (TensorUtils::GetTensorSizeInBytes(*tensor_desc, size) != GRAPH_SUCCESS) { | ||||
REPORT_INNER_ERROR("E19999", "Get input TensorSize in op:%s(%s) failed, input_index:%zu", | REPORT_INNER_ERROR("E19999", "Get input TensorSize in op:%s(%s) failed, input_index:%zu", | ||||
op_desc->GetName().c_str(), op_desc->GetType().c_str(), idx); | op_desc->GetName().c_str(), op_desc->GetType().c_str(), idx); | ||||
GELOGE(FAILED, "GetTensorSizeInBytes failed!"); | |||||
GELOGE(FAILED, "[Get][InputTensorSize] in op:%s(%s) failed, input_index:%zu", | |||||
op_desc->GetName().c_str(), op_desc->GetType().c_str(), idx); | |||||
return FAILED; | return FAILED; | ||||
} | } | ||||
@@ -88,8 +94,13 @@ Status ZeroCopyOffset::InitOutputDataInfo(const vector<int64_t> &input_size_list | |||||
op_name_ = op_desc->GetName(); | op_name_ = op_desc->GetName(); | ||||
(void)ge::AttrUtils::GetListInt(op_desc, ATTR_ZERO_COPY_BASIC_OFFSET, zero_copy_basic_offset_); | (void)ge::AttrUtils::GetListInt(op_desc, ATTR_ZERO_COPY_BASIC_OFFSET, zero_copy_basic_offset_); | ||||
(void)ge::AttrUtils::GetListInt(op_desc, ATTR_ZERO_COPY_RELATIVE_OFFSET, zero_copy_relative_offset_); | (void)ge::AttrUtils::GetListInt(op_desc, ATTR_ZERO_COPY_RELATIVE_OFFSET, zero_copy_relative_offset_); | ||||
GE_CHK_BOOL_EXEC(zero_copy_basic_offset_.size() == zero_copy_relative_offset_.size(), return PARAM_INVALID, | |||||
"basic_offset_size should be equal to relative_offset_size"); | |||||
GE_CHK_BOOL_EXEC(zero_copy_basic_offset_.size() == zero_copy_relative_offset_.size(), | |||||
REPORT_INNER_ERROR("E19999", "basic_offset_size:%zu not equal to relative_offset_size:%zu, " | |||||
"check invalid", | |||||
zero_copy_basic_offset_.size(), zero_copy_relative_offset_.size()); | |||||
return PARAM_INVALID, | |||||
"[Check][Param] basic_offset_size:%zu should be equal to relative_offset_size:%zu", | |||||
zero_copy_basic_offset_.size(), zero_copy_relative_offset_.size()); | |||||
int64_t virtual_addr_offset = op_desc->GetInputOffset().at(idx); | int64_t virtual_addr_offset = op_desc->GetInputOffset().at(idx); | ||||
IsL2Fusion(zero_copy_basic_offset_, virtual_addr_offset, fusion_flag); | IsL2Fusion(zero_copy_basic_offset_, virtual_addr_offset, fusion_flag); | ||||
@@ -194,7 +205,8 @@ void ZeroCopyOffset::SetOutsideAddrsValue(ZeroCopyTask &zero_copy_task, void *ou | |||||
for (uint32_t out_count = 0; out_count < GetAddrCount(); ++out_count) { | for (uint32_t out_count = 0; out_count < GetAddrCount(); ++out_count) { | ||||
auto args_addrs = outside_addrs_[out_count].find(outside_addr); | auto args_addrs = outside_addrs_[out_count].find(outside_addr); | ||||
if (args_addrs != outside_addrs_[out_count].end()) { | if (args_addrs != outside_addrs_[out_count].end()) { | ||||
GE_CHK_STATUS(zero_copy_task.SetTaskArgsOffset(addr_val, offset), "Input args invalid."); | |||||
GE_CHK_STATUS(zero_copy_task.SetTaskArgsOffset(addr_val, offset), | |||||
"[Set][TaskArgsOffset] failed, Input args invalid, offset:%zu.", offset); | |||||
void *args_val = static_cast<uint8_t *>(args) + offset; | void *args_val = static_cast<uint8_t *>(args) + offset; | ||||
args_addrs->second.push_back(args_val); | args_addrs->second.push_back(args_val); | ||||
GELOGD("[ZCPY] set copy input: virtual_addr: 0x%lx, task_addr: %p, args: %p, offset: %zu.", addr_val, args_val, | GELOGD("[ZCPY] set copy input: virtual_addr: 0x%lx, task_addr: %p, args: %p, offset: %zu.", addr_val, args_val, | ||||
@@ -36,9 +36,9 @@ ZeroCopyTask::~ZeroCopyTask() { args_addr_ = nullptr; } | |||||
*/ | */ | ||||
Status ZeroCopyTask::SetTaskArgsOffset(uintptr_t addr, size_t offset) { | Status ZeroCopyTask::SetTaskArgsOffset(uintptr_t addr, size_t offset) { | ||||
if (offset + sizeof(uintptr_t) > args_size_) { | if (offset + sizeof(uintptr_t) > args_size_) { | ||||
REPORT_INNER_ERROR("E19999", "Param offset:%zu + 8 > args_size_:%zu, check invalid", | |||||
offset, args_size_); | |||||
GELOGE(FAILED, "[ZCPY] %s set task args failed, args size: %zu, offset: %zu", name_.c_str(), args_size_, offset); | |||||
REPORT_INNER_ERROR("E19999", "Param offset:%zu + 8 > args_size_:%zu, check invalid", offset, args_size_); | |||||
GELOGE(FAILED, "[Check][Param] [ZCPY] %s set task args failed, args size:%zu, offset:%zu", | |||||
name_.c_str(), args_size_, offset); | |||||
return FAILED; // unexpected error, need fix. | return FAILED; // unexpected error, need fix. | ||||
} | } | ||||
@@ -118,9 +118,8 @@ Status ZeroCopyTask::DistributeParam(bool async_mode, rtStream_t stream) { | |||||
} | } | ||||
if (rt_err != RT_ERROR_NONE) { | if (rt_err != RT_ERROR_NONE) { | ||||
REPORT_CALL_ERROR("E19999", "Call rtMemcpyAsync or rtMemcpy failed, size:%zu, ret: 0x%X", | |||||
args_size_, rt_err); | |||||
GELOGE(RT_FAILED, "[ZCPY] %s distribute task param failed, error=0x%x", name_.c_str(), rt_err); | |||||
REPORT_CALL_ERROR("E19999", "Call rtMemcpyAsync or rtMemcpy failed, size:%zu, ret:0x%X", args_size_, rt_err); | |||||
GELOGE(RT_FAILED, "[Distribute][TaskParam] for %s failed, error = 0x%x", name_.c_str(), rt_err); | |||||
return RT_ERROR_TO_GE_STATUS(rt_err); | return RT_ERROR_TO_GE_STATUS(rt_err); | ||||
} | } | ||||
@@ -112,7 +112,7 @@ Status CachingAllocator::Initialize(uint32_t device_id) { | |||||
auto bin_ptr = new (std::nothrow) BlockBin(BlockComparator); | auto bin_ptr = new (std::nothrow) BlockBin(BlockComparator); | ||||
if (bin_ptr == nullptr) { | if (bin_ptr == nullptr) { | ||||
REPORT_CALL_ERROR("E19999", "New BlockBin fail, device_id:%u", device_id); | REPORT_CALL_ERROR("E19999", "New BlockBin fail, device_id:%u", device_id); | ||||
GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Alloc BlockBin failed."); | |||||
GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Alloc][BlockBin] failed, device_id:%u", device_id); | |||||
return ACL_ERROR_GE_MEMORY_ALLOCATION; | return ACL_ERROR_GE_MEMORY_ALLOCATION; | ||||
} | } | ||||
free_block_bins_[i] = bin_ptr; | free_block_bins_[i] = bin_ptr; | ||||
@@ -147,9 +147,8 @@ uint8_t *CachingAllocator::Malloc(size_t size, uint8_t *org_ptr, uint32_t device | |||||
ptr = block->ptr; | ptr = block->ptr; | ||||
} | } | ||||
if (ptr == nullptr) { | if (ptr == nullptr) { | ||||
REPORT_INNER_ERROR("E19999", "FindFreeBlock fail, size:%zu, device_id:%u", | |||||
size, device_id); | |||||
GELOGE(FAILED, "Malloc failed device id = %u, size= %zu", device_id, size); | |||||
REPORT_INNER_ERROR("E19999", "FindFreeBlock fail, size:%zu, device_id:%u", size, device_id); | |||||
GELOGE(FAILED, "[Check][Param] FindFreeBlock failed device id = %u, size= %zu", device_id, size); | |||||
} | } | ||||
return ptr; | return ptr; | ||||
} | } | ||||
@@ -157,18 +156,16 @@ uint8_t *CachingAllocator::Malloc(size_t size, uint8_t *org_ptr, uint32_t device | |||||
Status CachingAllocator::Free(uint8_t *ptr, uint32_t device_id) { | Status CachingAllocator::Free(uint8_t *ptr, uint32_t device_id) { | ||||
GELOGI("Free device id = %u", device_id); | GELOGI("Free device id = %u", device_id); | ||||
if (ptr == nullptr) { | if (ptr == nullptr) { | ||||
REPORT_INNER_ERROR("E19999", "Param ptr is nullptr, device_id:%u, check invalid", | |||||
device_id); | |||||
GELOGE(PARAM_INVALID, "Invalid memory pointer"); | |||||
REPORT_INNER_ERROR("E19999", "Param ptr is nullptr, device_id:%u, check invalid", device_id); | |||||
GELOGE(PARAM_INVALID, "[Check][Param] Invalid memory pointer, device_id:%u", device_id); | |||||
return ge::PARAM_INVALID; | return ge::PARAM_INVALID; | ||||
} | } | ||||
std::lock_guard<std::recursive_mutex> lock(mutex_); | std::lock_guard<std::recursive_mutex> lock(mutex_); | ||||
auto it = allocated_blocks_.find(ptr); | auto it = allocated_blocks_.find(ptr); | ||||
if (it == allocated_blocks_.end()) { | if (it == allocated_blocks_.end()) { | ||||
REPORT_INNER_ERROR("E19999", "Param ptr not allocated before, device_id:%u, check invalid", | |||||
device_id); | |||||
GELOGE(PARAM_INVALID, "Invalid memory pointer: %p", ptr); | |||||
REPORT_INNER_ERROR("E19999", "Param ptr not allocated before, device_id:%u, check invalid", device_id); | |||||
GELOGE(PARAM_INVALID, "[Check][Param] Param ptr not allocated before, device_id:%u", device_id); | |||||
return ge::PARAM_INVALID; | return ge::PARAM_INVALID; | ||||
} | } | ||||
Block *block = it->second; | Block *block = it->second; | ||||
@@ -225,9 +222,8 @@ Block *CachingAllocator::FindFreeBlock(size_t size, uint8_t *org_ptr, uint32_t d | |||||
Block key(device_id, size, org_ptr); | Block key(device_id, size, org_ptr); | ||||
BlockBin *bin = GetBlockBin(size); | BlockBin *bin = GetBlockBin(size); | ||||
if (bin == nullptr) { | if (bin == nullptr) { | ||||
REPORT_INNER_ERROR("E19999", "GetBlockBin fail, size:%zu, device_id:%u", | |||||
size, device_id); | |||||
GELOGE(ge::FAILED, "Get block bin failed size = %zu", size); | |||||
REPORT_INNER_ERROR("E19999", "GetBlockBin fail, size:%zu, device_id:%u", size, device_id); | |||||
GELOGE(ge::FAILED, "[Get][BlockBin] failed, size:%zu, device_id:%u", size, device_id); | |||||
return nullptr; | return nullptr; | ||||
} | } | ||||
std::lock_guard<std::recursive_mutex> lock(mutex_); | std::lock_guard<std::recursive_mutex> lock(mutex_); | ||||
@@ -258,9 +254,8 @@ Block *CachingAllocator::SplitBlock(Block *block, size_t size, BlockBin &bin, ui | |||||
Block *remaining = block; | Block *remaining = block; | ||||
Block *new_block = new (std::nothrow) Block(device_id, size, &bin, block->ptr); | Block *new_block = new (std::nothrow) Block(device_id, size, &bin, block->ptr); | ||||
if (new_block == nullptr) { | if (new_block == nullptr) { | ||||
REPORT_CALL_ERROR("E19999", "New Block fail, size:%zu, device_id:%u", | |||||
size, device_id); | |||||
GELOGE(ge::FAILED, "Alloc block failed size = %zu", size); | |||||
REPORT_CALL_ERROR("E19999", "New Block fail, size:%zu, device_id:%u", size, device_id); | |||||
GELOGE(ge::FAILED, "[Alloc][Block] failed, size:%zu, device_id:%u", size, device_id); | |||||
return block; | return block; | ||||
} | } | ||||
new_block->prev = remaining->prev; | new_block->prev = remaining->prev; | ||||
@@ -285,7 +280,7 @@ Status CachingAllocator::TryExtendCache(size_t size, uint32_t device_id) { | |||||
size_t free_cached_memory_size = FreeCachedBlocks(); | size_t free_cached_memory_size = FreeCachedBlocks(); | ||||
memory_addr = memory_allocator_->MallocMemory(purpose, memory_size, device_id); | memory_addr = memory_allocator_->MallocMemory(purpose, memory_size, device_id); | ||||
if (memory_addr == nullptr) { | if (memory_addr == nullptr) { | ||||
GELOGE(ge::FAILED, "TryExtendCache failed, no enough memory for size = %zu, device_id = %u", memory_size, | |||||
GELOGE(ge::FAILED, "[Malloc][Memory] failed, no enough memory for size = %zu, device_id = %u", memory_size, | |||||
device_id); | device_id); | ||||
return ge::FAILED; | return ge::FAILED; | ||||
} | } | ||||
@@ -304,16 +299,14 @@ Status CachingAllocator::TryExtendCache(size_t size, uint32_t device_id) { | |||||
Status CachingAllocator::AddToBlockBin(uint8_t *ptr, size_t size, uint32_t device_id) { | Status CachingAllocator::AddToBlockBin(uint8_t *ptr, size_t size, uint32_t device_id) { | ||||
BlockBin *bin = GetBlockBin(size); | BlockBin *bin = GetBlockBin(size); | ||||
if (bin == nullptr) { | if (bin == nullptr) { | ||||
REPORT_INNER_ERROR("E19999", "GetBlockBin fail, size:%zu, device_id:%u", | |||||
size, device_id); | |||||
GELOGE(ge::FAILED, "Get block bin failed size = %zu", size); | |||||
REPORT_INNER_ERROR("E19999", "GetBlockBin fail, size:%zu, device_id:%u", size, device_id); | |||||
GELOGE(ge::FAILED, "[Get][BlockBin] failed, size:%zu, device_id:%u", size, device_id); | |||||
return ge::FAILED; | return ge::FAILED; | ||||
} | } | ||||
Block *block = new (std::nothrow) Block(device_id, size, bin, nullptr); | Block *block = new (std::nothrow) Block(device_id, size, bin, nullptr); | ||||
if (block == nullptr) { | if (block == nullptr) { | ||||
REPORT_CALL_ERROR("E19999", "New Block fail, size:%zu, device_id:%u", | |||||
size, device_id); | |||||
GELOGE(ge::FAILED, "Alloc block failed size = %zu", size); | |||||
REPORT_CALL_ERROR("E19999", "New Block fail, size:%zu, device_id:%u", size, device_id); | |||||
GELOGE(ge::FAILED, "[Alloc][Block] failed, size:%zu, device_id:%u", size, device_id); | |||||
return ge::FAILED; | return ge::FAILED; | ||||
} | } | ||||
@@ -33,7 +33,7 @@ GraphContext::GraphContext(const GraphNodePtr &graph_node) { | |||||
if (compute_graph_ == nullptr) { | if (compute_graph_ == nullptr) { | ||||
std::shared_ptr<const ge::Graph> graph = graph_node->GetGraph(); | std::shared_ptr<const ge::Graph> graph = graph_node->GetGraph(); | ||||
if (graph == nullptr) { | if (graph == nullptr) { | ||||
GELOGE(GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL, "compute_graph by graphNode is NULL!"); | |||||
GELOGE(GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL, "[Get][Graph] failed, compute_graph by graphNode is NULL!"); | |||||
return; | return; | ||||
} | } | ||||
@@ -45,7 +45,7 @@ GraphContext::GraphContext(const GraphNodePtr &graph_node) { | |||||
Status GraphContext::SetComputeGraph(const GraphNodePtr &graph_node) { | Status GraphContext::SetComputeGraph(const GraphNodePtr &graph_node) { | ||||
if (graph_node == nullptr) { | if (graph_node == nullptr) { | ||||
REPORT_INNER_ERROR("E19999", "Param graph_node is nullptr, check invalid"); | REPORT_INNER_ERROR("E19999", "Param graph_node is nullptr, check invalid"); | ||||
GELOGE(GE_GRAPH_PARAM_NULLPTR, "graphNode is NULL!"); | |||||
GELOGE(GE_GRAPH_PARAM_NULLPTR, "[Check][Param] graphNode is NULL!"); | |||||
return GE_GRAPH_PARAM_NULLPTR; | return GE_GRAPH_PARAM_NULLPTR; | ||||
} | } | ||||
@@ -56,7 +56,7 @@ Status GraphContext::SetComputeGraph(const GraphNodePtr &graph_node) { | |||||
std::shared_ptr<const ge::Graph> graph = graph_node->GetGraph(); | std::shared_ptr<const ge::Graph> graph = graph_node->GetGraph(); | ||||
if (graph == nullptr) { | if (graph == nullptr) { | ||||
REPORT_INNER_ERROR("E19999", "Param graph in graph_node is nullptr, check invalid"); | REPORT_INNER_ERROR("E19999", "Param graph in graph_node is nullptr, check invalid"); | ||||
GELOGE(GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL, "compute_graph by graphNode is NULL!"); | |||||
GELOGE(GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL, "[Get][Graph] failed, compute_graph by graphNode is NULL!"); | |||||
return GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL; | return GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL; | ||||
} | } | ||||
@@ -73,14 +73,15 @@ Status GraphContext::Finalize() const { return SUCCESS; } | |||||
Status GraphContext::GetVariableTensor(const std::string &var_data_name, GeTensor &returned_tensor) { | Status GraphContext::GetVariableTensor(const std::string &var_data_name, GeTensor &returned_tensor) { | ||||
if (var_data_name.empty()) { | if (var_data_name.empty()) { | ||||
REPORT_INNER_ERROR("E19999", "Param var_data_name is empty, check invalid"); | REPORT_INNER_ERROR("E19999", "Param var_data_name is empty, check invalid"); | ||||
GELOGE(GE_GRAPH_EMPTY_STRING_NAME, "Variable data name is empty!"); | |||||
GELOGE(GE_GRAPH_EMPTY_STRING_NAME, "[Check][Param] Variable data name is empty!"); | |||||
return GE_GRAPH_EMPTY_STRING_NAME; | return GE_GRAPH_EMPTY_STRING_NAME; | ||||
} | } | ||||
if (GetVarNodeTensorTable().empty()) { | if (GetVarNodeTensorTable().empty()) { | ||||
REPORT_INNER_ERROR("E19999", "VarNodeTensorTable is empty, var_data_name:%s, check invalid", | REPORT_INNER_ERROR("E19999", "VarNodeTensorTable is empty, var_data_name:%s, check invalid", | ||||
var_data_name.c_str()); | var_data_name.c_str()); | ||||
GELOGE(GE_GRAPH_EMPTY_VARIABLE_TENSOR_TABLE, "VarNodeTensorTable is empty!"); | |||||
GELOGE(GE_GRAPH_EMPTY_VARIABLE_TENSOR_TABLE, "[Check][Param] VarNodeTensorTable is empty, var_data_name:%s", | |||||
var_data_name.c_str()); | |||||
return GE_GRAPH_EMPTY_VARIABLE_TENSOR_TABLE; | return GE_GRAPH_EMPTY_VARIABLE_TENSOR_TABLE; | ||||
} | } | ||||
for (auto &var_record : GetVarNodeTensorTable()) { | for (auto &var_record : GetVarNodeTensorTable()) { | ||||
@@ -88,9 +89,8 @@ Status GraphContext::GetVariableTensor(const std::string &var_data_name, GeTenso | |||||
returned_tensor.SetTensorDesc(var_record.second.GetTensorDesc()); | returned_tensor.SetTensorDesc(var_record.second.GetTensorDesc()); | ||||
auto ret = returned_tensor.SetData(var_record.second.GetData()); | auto ret = returned_tensor.SetData(var_record.second.GetData()); | ||||
if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
REPORT_INNER_ERROR("E19999", "SetData to tensor fail, var_data_name:%s", | |||||
var_data_name.c_str()); | |||||
GELOGE(ret, "Set Tensor data failed!"); | |||||
REPORT_INNER_ERROR("E19999", "SetData to tensor fail, var_data_name:%s", var_data_name.c_str()); | |||||
GELOGE(ret, "[Set][Data] to Tensor failed, var_data_name:%s", var_data_name.c_str()); | |||||
return ret; | return ret; | ||||
} | } | ||||
@@ -100,7 +100,8 @@ Status GraphContext::GetVariableTensor(const std::string &var_data_name, GeTenso | |||||
REPORT_INNER_ERROR("E19999", "VarRecord with data_name:%s does not exist, check invalid", | REPORT_INNER_ERROR("E19999", "VarRecord with data_name:%s does not exist, check invalid", | ||||
var_data_name.c_str()); | var_data_name.c_str()); | ||||
GELOGE(GE_GRAPH_VARIABLE_DOES_NOT_EXIST, "VarRecord with data_name %s does NOT exist!", var_data_name.c_str()); | |||||
GELOGE(GE_GRAPH_VARIABLE_DOES_NOT_EXIST, "[Check][Param] VarRecord with data_name %s does NOT exist!", | |||||
var_data_name.c_str()); | |||||
return GE_GRAPH_VARIABLE_DOES_NOT_EXIST; | return GE_GRAPH_VARIABLE_DOES_NOT_EXIST; | ||||
} | } | ||||
@@ -46,7 +46,7 @@ GraphNode::GraphNode(GraphId graph_id) | |||||
sem_(1) { | sem_(1) { | ||||
graph_run_async_listener_ = MakeShared<RunAsyncListener>(); | graph_run_async_listener_ = MakeShared<RunAsyncListener>(); | ||||
if (graph_run_async_listener_ == nullptr) { | if (graph_run_async_listener_ == nullptr) { | ||||
GELOGE(MEMALLOC_FAILED, "Make shared failed"); | |||||
GELOGE(MEMALLOC_FAILED, "[New][RunAsyncListener] failed"); | |||||
} | } | ||||
} | } | ||||
@@ -82,7 +82,8 @@ SubGraphInfo::~SubGraphInfo() { | |||||
rt_ret = rtFreeHost(buffer_addr); | rt_ret = rtFreeHost(buffer_addr); | ||||
buffer_addr = nullptr; | buffer_addr = nullptr; | ||||
if (rt_ret != RT_ERROR_NONE) { | if (rt_ret != RT_ERROR_NONE) { | ||||
GELOGE(rt_ret, "[GraphManager] subgraph free buffer failed, modelId = %u", model_id_info_.model_id); | |||||
GELOGE(rt_ret, "[Call][RtFreeHost] subgraph free buffer failed, modelId = %u", | |||||
model_id_info_.model_id); | |||||
} | } | ||||
} | } | ||||
} | } | ||||
@@ -94,8 +95,8 @@ Status SubGraphInfo::FreeInOutBuffer() { | |||||
rtError_t rt_ret; | rtError_t rt_ret; | ||||
rt_ret = rtFreeHost(*iter); | rt_ret = rtFreeHost(*iter); | ||||
if (rt_ret != RT_ERROR_NONE) { | if (rt_ret != RT_ERROR_NONE) { | ||||
REPORT_CALL_ERROR("E19999", "Call rtFreeHost fail"); | |||||
GELOGE(rt_ret, "[GraphManager] subgraph free buffer failed, modelId = %u", model_id_info_.model_id); | |||||
REPORT_CALL_ERROR("E19999", "Call rtFreeHost fail, ret:%d", rt_ret); | |||||
GELOGE(rt_ret, "[Call][RtFreeHost] subgraph free buffer failed, modelId = %u", model_id_info_.model_id); | |||||
buffer_addr_.erase(buffer_addr_.begin(), iter); | buffer_addr_.erase(buffer_addr_.begin(), iter); | ||||
return GE_GRAPH_FREE_FAILED; | return GE_GRAPH_FREE_FAILED; | ||||
} | } | ||||
@@ -131,7 +132,7 @@ Status GraphModelListener::OnComputeDone(uint32_t model_id, uint32_t task_id, ui | |||||
uint32_t GraphModelListener::GetResultCode() const { | uint32_t GraphModelListener::GetResultCode() const { | ||||
if (!is_finished_) { | if (!is_finished_) { | ||||
REPORT_CALL_ERROR("E19999", "Model not run finish"); | REPORT_CALL_ERROR("E19999", "Model not run finish"); | ||||
GELOGE(INTERNAL_ERROR, "[GraphManager] model not run finish."); | |||||
GELOGE(INTERNAL_ERROR, "[Check][Param] model not run finish."); | |||||
return INTERNAL_ERROR; | return INTERNAL_ERROR; | ||||
} | } | ||||
return result_code_; | return result_code_; | ||||
@@ -170,7 +171,9 @@ bool HasCalcOp(const ComputeGraphPtr &graph) { | |||||
for (const auto &node : graph->GetAllNodes()) { | for (const auto &node : graph->GetAllNodes()) { | ||||
OpDescPtr op_desc = node->GetOpDesc(); | OpDescPtr op_desc = node->GetOpDesc(); | ||||
GE_IF_BOOL_EXEC(op_desc == nullptr, GELOGE(FAILED, "Node GetOpDesc is nullptr"); return false); | |||||
GE_IF_BOOL_EXEC(op_desc == nullptr, | |||||
REPORT_INNER_ERROR("E19999", "GetOpDesc failed, Node GetOpDesc is nullptr"); | |||||
GELOGE(FAILED, "[Get][OpDesc] failed, Node GetOpDesc is nullptr"); return false); | |||||
if (calc_op_type.find(op_desc->GetType()) != calc_op_type.end()) { | if (calc_op_type.find(op_desc->GetType()) != calc_op_type.end()) { | ||||
return true; | return true; | ||||
} | } | ||||
@@ -50,9 +50,7 @@ uint8_t *MemoryAllocator::MallocMemory(const string &purpose, size_t memory_size | |||||
if (rtMalloc(reinterpret_cast<void **>(&memory_addr), memory_size, memory_type_) != RT_ERROR_NONE) { | if (rtMalloc(reinterpret_cast<void **>(&memory_addr), memory_size, memory_type_) != RT_ERROR_NONE) { | ||||
REPORT_CALL_ERROR("E19999", "Call rtMalloc fail, purpose:%s, size:%zu, device_id:%u", | REPORT_CALL_ERROR("E19999", "Call rtMalloc fail, purpose:%s, size:%zu, device_id:%u", | ||||
purpose.c_str(), memory_size, device_id); | purpose.c_str(), memory_size, device_id); | ||||
GELOGE(ge::INTERNAL_ERROR, | |||||
"MemoryAllocator::MallocMemory device_id = %u," | |||||
" size= %lu", | |||||
GELOGE(ge::INTERNAL_ERROR, "[Malloc][Memory] failed, device_id = %u, size= %lu", | |||||
device_id, memory_size); | device_id, memory_size); | ||||
return nullptr; | return nullptr; | ||||
@@ -68,7 +66,7 @@ Status MemoryAllocator::FreeMemory(uint8_t *memory_addr, uint32_t device_id) con | |||||
auto rtRet = rtFree(memory_addr); | auto rtRet = rtFree(memory_addr); | ||||
if (rtRet != RT_ERROR_NONE) { | if (rtRet != RT_ERROR_NONE) { | ||||
REPORT_CALL_ERROR("E19999", "Call rtFree fail, device_id:%u", device_id); | REPORT_CALL_ERROR("E19999", "Call rtFree fail, device_id:%u", device_id); | ||||
GELOGE(rtRet, "MemoryAllocator::MallocMemory device_id = %u", device_id); | |||||
GELOGE(rtRet, "[Call][RtFree] failed, device_id = %u", device_id); | |||||
return RT_ERROR_TO_GE_STATUS(rtRet); | return RT_ERROR_TO_GE_STATUS(rtRet); | ||||
} | } | ||||
memory_addr = nullptr; | memory_addr = nullptr; | ||||
@@ -88,10 +86,8 @@ uint8_t *MemoryAllocator::MallocMemory(const string &purpose, const string &memo | |||||
if (memory_addr == nullptr) { | if (memory_addr == nullptr) { | ||||
REPORT_CALL_ERROR("E19999", "Malloc Memory fail, purpose:%s, memory_key:%s, memory_size:%zu, device_id:%u", | REPORT_CALL_ERROR("E19999", "Malloc Memory fail, purpose:%s, memory_key:%s, memory_size:%zu, device_id:%u", | ||||
purpose.c_str(), memory_key.c_str(), memory_size, device_id); | purpose.c_str(), memory_key.c_str(), memory_size, device_id); | ||||
GELOGE(ge::INTERNAL_ERROR, | |||||
"MemoryAllocator::MallocMemory failed," | |||||
" memory_key[%s], size = %lu.", | |||||
memory_key.c_str(), memory_size); | |||||
GELOGE(ge::INTERNAL_ERROR, "[Malloc][Memory] failed, memory_key[%s], size = %lu, device_id:%u.", | |||||
memory_key.c_str(), memory_size, device_id); | |||||
return nullptr; | return nullptr; | ||||
} | } | ||||
@@ -126,10 +122,8 @@ Status MemoryAllocator::FreeMemory(const string &memory_key, uint32_t device_id) | |||||
if (FreeMemory(it->second.memory_addr_, device_id) != ge::SUCCESS) { | if (FreeMemory(it->second.memory_addr_, device_id) != ge::SUCCESS) { | ||||
REPORT_CALL_ERROR("E19999", "Free Memory fail, memory_key:%s, device_id:%u", | REPORT_CALL_ERROR("E19999", "Free Memory fail, memory_key:%s, device_id:%u", | ||||
memory_key.c_str(), device_id); | memory_key.c_str(), device_id); | ||||
GELOGE(ge::INTERNAL_ERROR, | |||||
"MemoryAllocator::FreeMemory rtFree failed," | |||||
" memory_key[%s]", | |||||
memory_key.c_str()); | |||||
GELOGE(ge::INTERNAL_ERROR, "[Free][Memory] failed, memory_key[%s], device_id:%u", | |||||
memory_key.c_str(), device_id); | |||||
return ge::INTERNAL_ERROR; | return ge::INTERNAL_ERROR; | ||||
} | } | ||||
@@ -40,7 +40,8 @@ ge::Status VarResource::GetVarAddr(const std::string &var_name, const ge::GeTens | |||||
if (dev_ptr == nullptr) { | if (dev_ptr == nullptr) { | ||||
REPORT_INNER_ERROR("E19999", "Param dev_ptr is nullptr, var_name:%s, session_id:%lu, " | REPORT_INNER_ERROR("E19999", "Param dev_ptr is nullptr, var_name:%s, session_id:%lu, " | ||||
"check invalid", var_name.c_str(), session_id_); | "check invalid", var_name.c_str(), session_id_); | ||||
GELOGE(FAILED, "[GetVarAddr] dev_ptr is null!"); | |||||
GELOGE(FAILED, "[Check][Param] Param dev_ptr is nullptr, var_name:%s, session_id:%lu", | |||||
var_name.c_str(), session_id_); | |||||
return FAILED; | return FAILED; | ||||
} | } | ||||
std::string var_key = VarKey(var_name, tensor_desc); | std::string var_key = VarKey(var_name, tensor_desc); | ||||
@@ -51,7 +52,8 @@ ge::Status VarResource::GetVarAddr(const std::string &var_name, const ge::GeTens | |||||
REPORT_INNER_ERROR("E19999", "var_key:%s can't find in var_addr_mgr_map_, var_name:%s, session_id:%lu, " | REPORT_INNER_ERROR("E19999", "var_key:%s can't find in var_addr_mgr_map_, var_name:%s, session_id:%lu, " | ||||
"check invalid", var_key.c_str(), var_name.c_str(), | "check invalid", var_key.c_str(), var_name.c_str(), | ||||
session_id_); | session_id_); | ||||
GELOGE(FAILED, "VarResource::GetVarAddr failed, var_key %s", var_key.c_str()); | |||||
GELOGE(FAILED, "[Check][Param] var_key:%s can't find in var_addr_mgr_map_, var_name:%s, session_id:%lu", | |||||
var_key.c_str(), var_name.c_str(), session_id_); | |||||
return FAILED; | return FAILED; | ||||
} | } | ||||
@@ -109,7 +111,8 @@ ge::Status VarResource::SaveVarAddr(const std::string &var_name, const ge::GeTen | |||||
REPORT_INNER_ERROR("E19999", "var_key:%s conflict in var_addr_mgr_map_, var_name:%s, session_id:%lu, " | REPORT_INNER_ERROR("E19999", "var_key:%s conflict in var_addr_mgr_map_, var_name:%s, session_id:%lu, " | ||||
"check invalid", var_key.c_str(), var_name.c_str(), | "check invalid", var_key.c_str(), var_name.c_str(), | ||||
session_id_); | session_id_); | ||||
GELOGE(FAILED, "VarResource::SaveVarAddr, var_key %s save addr conflict", var_key.c_str()); | |||||
GELOGE(FAILED, "[Check][Param] var_key:%s conflict in var_addr_mgr_map_, var_name:%s, session_id:%lu", | |||||
var_key.c_str(), var_name.c_str(), session_id_); | |||||
return FAILED; | return FAILED; | ||||
} | } | ||||
@@ -145,14 +148,15 @@ ge::Status VarResource::RenewCurVarDesc(const std::string &var_name, const ge::O | |||||
if (op_desc == nullptr) { | if (op_desc == nullptr) { | ||||
REPORT_INNER_ERROR("E19999", "Param op_desc is nullptr, var_name:%s, session_id:%lu, check invalid", | REPORT_INNER_ERROR("E19999", "Param op_desc is nullptr, var_name:%s, session_id:%lu, check invalid", | ||||
var_name.c_str(), session_id_); | var_name.c_str(), session_id_); | ||||
GELOGE(FAILED, "[RenewCurVarDesc] renew var desc fail! input opdesc is null!"); | |||||
GELOGE(FAILED, "[Check][Param] input opdesc is nullptr, var_name:%s, session_id:%lu", | |||||
var_name.c_str(), session_id_); | |||||
return FAILED; | return FAILED; | ||||
} | } | ||||
ge::GeTensorDesc curr_desc; | ge::GeTensorDesc curr_desc; | ||||
ge::Status ret = GetCurVarDesc(var_name, curr_desc); | ge::Status ret = GetCurVarDesc(var_name, curr_desc); | ||||
if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
GELOGE(FAILED, "[RenewCurVarDesc] Get var desc fail!"); | |||||
GELOGE(FAILED, "[Get][CurVarDesc] fail, var_name:%s, session_id:%lu", var_name.c_str(), session_id_); | |||||
return FAILED; | return FAILED; | ||||
} | } | ||||
std::string key = VarKey(var_name, curr_desc); | std::string key = VarKey(var_name, curr_desc); | ||||
@@ -164,7 +168,8 @@ ge::Status VarResource::RenewCurVarDesc(const std::string &var_name, const ge::O | |||||
REPORT_INNER_ERROR("E19999", "var_key:%s can't find in var_addr_mgr_map_, var_name:%s, session_id:%lu, op:%s(%s), " | REPORT_INNER_ERROR("E19999", "var_key:%s can't find in var_addr_mgr_map_, var_name:%s, session_id:%lu, op:%s(%s), " | ||||
"check invalid", key.c_str(), var_name.c_str(), | "check invalid", key.c_str(), var_name.c_str(), | ||||
session_id_, op_desc->GetName().c_str(), op_desc->GetType().c_str()); | session_id_, op_desc->GetName().c_str(), op_desc->GetType().c_str()); | ||||
GELOGE(FAILED, "[RenewCurVarDesc] can't find ele with key [%s]", key.c_str()); | |||||
GELOGE(FAILED, "[Check][Param] var_key:%s can't find in var_addr_mgr_map_, var_name:%s, session_id:%lu, op:%s(%s)", | |||||
key.c_str(), var_name.c_str(), session_id_, op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||||
return FAILED; | return FAILED; | ||||
} | } | ||||
auto val = iter->second; | auto val = iter->second; | ||||
@@ -285,14 +290,15 @@ Status HbmMemResource::AssignVarMem(const std::string &var_name, uint64_t size, | |||||
if (total_size_ < var_mem_size_) { | if (total_size_ < var_mem_size_) { | ||||
REPORT_INNER_ERROR("E19999", "VarMemMaxSize:%lu < var_mem_size_:%lu, var_size:%lu, var_name:%s, check invalid" | REPORT_INNER_ERROR("E19999", "VarMemMaxSize:%lu < var_mem_size_:%lu, var_size:%lu, var_name:%s, check invalid" | ||||
"", total_size_, var_mem_size_, size, var_name.c_str()); | "", total_size_, var_mem_size_, size, var_name.c_str()); | ||||
GELOGE(PARAM_INVALID, "total_size_: %lu is smaller than var_mem_size_: %lu", total_size_, var_mem_size_); | |||||
GELOGE(PARAM_INVALID, "[Check][Param] total_size_:%lu is smaller than var_mem_size_:%lu, var_name:%s", | |||||
total_size_, var_mem_size_, var_name.c_str()); | |||||
return PARAM_INVALID; | return PARAM_INVALID; | ||||
} | } | ||||
uint64_t free_size = total_size_ - var_mem_size_; | uint64_t free_size = total_size_ - var_mem_size_; | ||||
if (free_size < (size + kSessionMemAlignSize * kSessionMemAlignUnit)) { | if (free_size < (size + kSessionMemAlignSize * kSessionMemAlignUnit)) { | ||||
REPORT_INNER_ERROR("E19999", "free_size:%lu not enough, var_align_size:%lu, var_name:%s, check invalid", | REPORT_INNER_ERROR("E19999", "free_size:%lu not enough, var_align_size:%lu, var_name:%s, check invalid", | ||||
free_size, size, var_name.c_str()); | free_size, size, var_name.c_str()); | ||||
GELOGE(PARAM_INVALID, "Out of memory : current var size[%lu] exceeds total var size[%lu]", | |||||
GELOGE(PARAM_INVALID, "[Check][Param] Out of memory: current var size[%lu] exceeds total var size[%lu]", | |||||
size + kSessionMemAlignSize * kSessionMemAlignUnit + var_mem_size_, total_size_); | size + kSessionMemAlignSize * kSessionMemAlignUnit + var_mem_size_, total_size_); | ||||
return PARAM_INVALID; | return PARAM_INVALID; | ||||
} | } | ||||
@@ -317,7 +323,7 @@ Status RdmaMemResource::AssignVarMem(const std::string &var_name, uint64_t size, | |||||
if (buffer == nullptr) { | if (buffer == nullptr) { | ||||
REPORT_CALL_ERROR("E19999", "malloc rdma memory fail, var_size:%lu, var_name:%s", | REPORT_CALL_ERROR("E19999", "malloc rdma memory fail, var_size:%lu, var_name:%s", | ||||
size, var_name.c_str()); | size, var_name.c_str()); | ||||
GELOGE(MEMALLOC_FAILED, "Failed to malloc rdma memory for node %s, size = %lu", var_name.c_str(), size); | |||||
GELOGE(MEMALLOC_FAILED, "[Malloc][RdmaMemory] for node %s failed, size = %lu", var_name.c_str(), size); | |||||
return MEMALLOC_FAILED; | return MEMALLOC_FAILED; | ||||
} | } | ||||
address = static_cast<size_t>(reinterpret_cast<uintptr_t>(buffer)); | address = static_cast<size_t>(reinterpret_cast<uintptr_t>(buffer)); | ||||
@@ -468,7 +474,8 @@ int64_t VarManager::GetVarMemSize(rtMemType_t memory_type) { | |||||
if (mem_resource == nullptr) { | if (mem_resource == nullptr) { | ||||
REPORT_INNER_ERROR("E19999", "Find no mem_resource in map, memory_type:%d, session_id:%lu", | REPORT_INNER_ERROR("E19999", "Find no mem_resource in map, memory_type:%d, session_id:%lu", | ||||
memory_type, session_id_); | memory_type, session_id_); | ||||
GELOGE(ge::INTERNAL_ERROR, "MemResource is invalid."); | |||||
GELOGE(ge::INTERNAL_ERROR, "[Check][Param] MemResource is invalid, memory_type:%d, session_id:%lu", | |||||
memory_type, session_id_); | |||||
return 0; | return 0; | ||||
} | } | ||||
return mem_resource->GetVarMemSize(); | return mem_resource->GetVarMemSize(); | ||||
@@ -483,7 +490,8 @@ Status VarManager::UpdateVarMemSize(rtMemType_t memory_type, int64_t mem_size) { | |||||
if (mem_resource == nullptr) { | if (mem_resource == nullptr) { | ||||
REPORT_CALL_ERROR("E19999", "memory_type:%d invalid or New MemResource fail, session_id:%lu", | REPORT_CALL_ERROR("E19999", "memory_type:%d invalid or New MemResource fail, session_id:%lu", | ||||
memory_type, session_id_); | memory_type, session_id_); | ||||
GELOGE(ge::INTERNAL_ERROR, "Alloc MemResource failed, memory_type = %u.", memory_type); | |||||
GELOGE(ge::INTERNAL_ERROR, "[Alloc][MemResource] failed, memory_type:%u, session_id:%lu", | |||||
memory_type, session_id_); | |||||
return ge::INTERNAL_ERROR; | return ge::INTERNAL_ERROR; | ||||
} else { | } else { | ||||
mem_resource_map_[memory_type] = mem_resource; | mem_resource_map_[memory_type] = mem_resource; | ||||
@@ -495,7 +503,8 @@ Status VarManager::UpdateVarMemSize(rtMemType_t memory_type, int64_t mem_size) { | |||||
if (mem_resource == nullptr) { | if (mem_resource == nullptr) { | ||||
REPORT_INNER_ERROR("E19999", "MemResource is invalid, memory_type:%d, session_id:%lu", | REPORT_INNER_ERROR("E19999", "MemResource is invalid, memory_type:%d, session_id:%lu", | ||||
memory_type, session_id_); | memory_type, session_id_); | ||||
GELOGE(ge::INTERNAL_ERROR, "MemResource is invalid."); | |||||
GELOGE(ge::INTERNAL_ERROR, "[Check][Param] MemResource is invalid, memory_type:%u, session_id:%lu", | |||||
memory_type, session_id_); | |||||
return FAILED; | return FAILED; | ||||
} | } | ||||
mem_resource->UpdateVarMemSize(mem_size); | mem_resource->UpdateVarMemSize(mem_size); | ||||
@@ -515,7 +524,8 @@ ge::Status VarManager::AssignVarMem(const std::string &var_name, const ge::GeTen | |||||
if (result != ge::SUCCESS) { | if (result != ge::SUCCESS) { | ||||
REPORT_CALL_ERROR("E19999", "Get size from tensor fail, var_name:%s, memory_type:%d, session_id:%lu", | REPORT_CALL_ERROR("E19999", "Get size from tensor fail, var_name:%s, memory_type:%d, session_id:%lu", | ||||
var_name.c_str(), memory_type, session_id_); | var_name.c_str(), memory_type, session_id_); | ||||
GELOGE(result, "get size from TensorDesc failed"); | |||||
GELOGE(result, "[Get][Size] from tensor fail, var_name:%s, memory_type:%u, session_id:%lu", | |||||
var_name.c_str(), memory_type, session_id_); | |||||
return result; | return result; | ||||
} | } | ||||
@@ -526,7 +536,8 @@ ge::Status VarManager::AssignVarMem(const std::string &var_name, const ge::GeTen | |||||
if (mem_resource == nullptr) { | if (mem_resource == nullptr) { | ||||
REPORT_CALL_ERROR("E19999", "memory_type:%d invalid or New MemResource fail, session_id:%lu", | REPORT_CALL_ERROR("E19999", "memory_type:%d invalid or New MemResource fail, session_id:%lu", | ||||
memory_type, session_id_); | memory_type, session_id_); | ||||
GELOGE(ge::INTERNAL_ERROR, "Alloc MemResource failed, memory_type = %u.", memory_type); | |||||
GELOGE(ge::INTERNAL_ERROR, "[Alloc][MemResource] failed, memory_type:%u, session_id:%lu.", | |||||
memory_type, session_id_); | |||||
return ge::INTERNAL_ERROR; | return ge::INTERNAL_ERROR; | ||||
} else { | } else { | ||||
mem_resource_map_[memory_type] = mem_resource; | mem_resource_map_[memory_type] = mem_resource; | ||||
@@ -538,7 +549,8 @@ ge::Status VarManager::AssignVarMem(const std::string &var_name, const ge::GeTen | |||||
if (mem_resource == nullptr) { | if (mem_resource == nullptr) { | ||||
REPORT_INNER_ERROR("E19999", "MemResource is invalid, memory_type:%d, session_id:%lu", | REPORT_INNER_ERROR("E19999", "MemResource is invalid, memory_type:%d, session_id:%lu", | ||||
memory_type, session_id_); | memory_type, session_id_); | ||||
GELOGE(ge::INTERNAL_ERROR, "MemResource is invalid, memory_type = %u.", memory_type); | |||||
GELOGE(ge::INTERNAL_ERROR, "[Check][Param] MemResource is invalid, memory_type:%u, session_id:%lu.", | |||||
memory_type, session_id_); | |||||
return ge::INTERNAL_ERROR; | return ge::INTERNAL_ERROR; | ||||
} | } | ||||
@@ -567,14 +579,15 @@ ge::Status VarManager::AssignVarMem(const std::string &var_name, const ge::GeTen | |||||
if (can_not_reuse_old_memory) { | if (can_not_reuse_old_memory) { | ||||
result = mem_resource->AssignVarMem(var_name, tensor_desc_size, session_id_, mem_offset); | result = mem_resource->AssignVarMem(var_name, tensor_desc_size, session_id_, mem_offset); | ||||
if (result != SUCCESS) { | if (result != SUCCESS) { | ||||
GELOGE(ge::INTERNAL_ERROR, "AssignVarMem by offset failed."); | |||||
GELOGE(ge::INTERNAL_ERROR, "[Assign][VarMem] by offset failed, session_id:%lu.", session_id_); | |||||
return ge::INTERNAL_ERROR; | return ge::INTERNAL_ERROR; | ||||
} | } | ||||
result = var_resource_->SaveVarAddr( | result = var_resource_->SaveVarAddr( | ||||
var_name, tensor_desc, reinterpret_cast<uint8_t *>(static_cast<uintptr_t>(mem_offset)), memory_type); | var_name, tensor_desc, reinterpret_cast<uint8_t *>(static_cast<uintptr_t>(mem_offset)), memory_type); | ||||
if (result != SUCCESS) { | if (result != SUCCESS) { | ||||
GELOGE(ge::INTERNAL_ERROR, "AssignVarMem by offset failed."); | |||||
GELOGE(ge::INTERNAL_ERROR, "[Save][VarAddr] by offset failed, memory type:%u, session_id:%lu.", | |||||
memory_type, session_id_); | |||||
return ge::INTERNAL_ERROR; | return ge::INTERNAL_ERROR; | ||||
} | } | ||||
} | } | ||||
@@ -681,7 +694,8 @@ ge::Status VarManager::RenewCurVarDesc(const std::string &var_name, ge::OpDescPt | |||||
REPORT_INNER_ERROR("E19999", "VarManager has not been init, op:%s(%s), session_id:%lu, check invalid", | REPORT_INNER_ERROR("E19999", "VarManager has not been init, op:%s(%s), session_id:%lu, check invalid", | ||||
op_desc->GetName().c_str(), op_desc->GetType().c_str(), | op_desc->GetName().c_str(), op_desc->GetType().c_str(), | ||||
session_id_); | session_id_); | ||||
GELOGE(ge::INTERNAL_ERROR, "VarManager has not been init."); | |||||
GELOGE(ge::INTERNAL_ERROR, "[Check][Param] VarManager has not been init, op:%s(%s), session_id:%lu", | |||||
op_desc->GetName().c_str(), op_desc->GetType().c_str(), session_id_); | |||||
return ge::INTERNAL_ERROR; | return ge::INTERNAL_ERROR; | ||||
} | } | ||||
return var_resource_->RenewCurVarDesc(var_name, std::move(op_desc)); | return var_resource_->RenewCurVarDesc(var_name, std::move(op_desc)); | ||||
@@ -729,10 +743,8 @@ ge::Status VarManager::MallocVarMemory(size_t memory_size) { | |||||
const string purpose("variables and constant op memory in training network."); | const string purpose("variables and constant op memory in training network."); | ||||
var_mem_base = MemManager::Instance().MemInstance(RT_MEMORY_HBM).MallocMemory(purpose, memory_key, var_memory_size); | var_mem_base = MemManager::Instance().MemInstance(RT_MEMORY_HBM).MallocMemory(purpose, memory_key, var_memory_size); | ||||
if (var_mem_base == nullptr) { | if (var_mem_base == nullptr) { | ||||
GELOGE(ge::INTERNAL_ERROR, | |||||
"VarManager::MallocVarMemory failed " | |||||
"session_id = %s", | |||||
memory_key.c_str()); | |||||
GELOGE(ge::INTERNAL_ERROR, "[Malloc][VarMemory] failed, size:%zu, session_id:%s", | |||||
var_memory_size, memory_key.c_str()); | |||||
return ge::INTERNAL_ERROR; | return ge::INTERNAL_ERROR; | ||||
} | } | ||||
return SUCCESS; | return SUCCESS; | ||||
@@ -812,7 +824,7 @@ Status VarManager::SetMemoryMallocSize(const map<string, string> &options) { | |||||
string graph_memory_manager_malloc_max_size = it->second; | string graph_memory_manager_malloc_max_size = it->second; | ||||
ge::Status ret = ParseMemoryMallocSize(graph_memory_manager_malloc_max_size, graph_mem_max_size_); | ge::Status ret = ParseMemoryMallocSize(graph_memory_manager_malloc_max_size, graph_mem_max_size_); | ||||
if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
GELOGE(ge::GE_GRAPH_OPTIONS_INVALID, "Parse graph memory manager malloc max size failed."); | |||||
GELOGE(ge::GE_GRAPH_OPTIONS_INVALID, "[Call][ParseMemoryMallocSize] failed, session id:%lu.", session_id_); | |||||
return ge::GE_GRAPH_OPTIONS_INVALID; | return ge::GE_GRAPH_OPTIONS_INVALID; | ||||
} | } | ||||
GELOGI("The max size for graph mem is set to %zu", graph_mem_max_size_); | GELOGI("The max size for graph mem is set to %zu", graph_mem_max_size_); | ||||
@@ -825,7 +837,7 @@ Status VarManager::SetMemoryMallocSize(const map<string, string> &options) { | |||||
string memory_var_manager_malloc_size = it->second; | string memory_var_manager_malloc_size = it->second; | ||||
ge::Status ret = ParseMemoryMallocSize(memory_var_manager_malloc_size, var_mem_max_size_); | ge::Status ret = ParseMemoryMallocSize(memory_var_manager_malloc_size, var_mem_max_size_); | ||||
if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
GELOGE(ge::GE_GRAPH_OPTIONS_INVALID, "Parse memory var manager malloc size failed."); | |||||
GELOGE(ge::GE_GRAPH_OPTIONS_INVALID, "[Call][ParseMemoryMallocSize] failed, session id:%lu.", session_id_); | |||||
return ge::GE_GRAPH_OPTIONS_INVALID; | return ge::GE_GRAPH_OPTIONS_INVALID; | ||||
} | } | ||||
} | } | ||||
@@ -834,8 +846,8 @@ Status VarManager::SetMemoryMallocSize(const map<string, string> &options) { | |||||
if (var_mem_logic_base_ > kMaxMemorySize) { | if (var_mem_logic_base_ > kMaxMemorySize) { | ||||
REPORT_INNER_ERROR("E19999", "var_login_base:%zu can not exeed limit:%zu, session_id:%lu, check invalid", | REPORT_INNER_ERROR("E19999", "var_login_base:%zu can not exeed limit:%zu, session_id:%lu, check invalid", | ||||
var_mem_logic_base_, kMaxMemorySize, session_id_); | var_mem_logic_base_, kMaxMemorySize, session_id_); | ||||
GELOGE(ge::GE_GRAPH_OPTIONS_INVALID, "kMemoryVarLogicBase : %zu can not exceed max memory size : %zu.", | |||||
var_mem_logic_base_, kMaxMemorySize); | |||||
GELOGE(ge::GE_GRAPH_OPTIONS_INVALID, "[Check][Param] kMemoryVarLogicBase:%zu can not exceed " | |||||
"max memory size:%zu, session_id:%lu.", var_mem_logic_base_, kMaxMemorySize, session_id_); | |||||
return ge::GE_GRAPH_OPTIONS_INVALID; | return ge::GE_GRAPH_OPTIONS_INVALID; | ||||
} | } | ||||
@@ -843,8 +855,8 @@ Status VarManager::SetMemoryMallocSize(const map<string, string> &options) { | |||||
if (use_max_mem_size_ > kMaxMemorySize) { | if (use_max_mem_size_ > kMaxMemorySize) { | ||||
REPORT_INNER_ERROR("E19999", "all mem_use size:%zu can not exeed limit:%zu, session_id:%lu, check invalid", | REPORT_INNER_ERROR("E19999", "all mem_use size:%zu can not exeed limit:%zu, session_id:%lu, check invalid", | ||||
use_max_mem_size_, kMaxMemorySize, session_id_); | use_max_mem_size_, kMaxMemorySize, session_id_); | ||||
GELOGE(ge::GE_GRAPH_OPTIONS_INVALID, "kUseMaxMemorySize : %zu can not exceed max memory size : %zu.", | |||||
use_max_mem_size_, kMaxMemorySize); | |||||
GELOGE(ge::GE_GRAPH_OPTIONS_INVALID, "[Check][Param] kUseMaxMemorySize:%zu can not exceed " | |||||
"max memory size:%zu, session_id:%lu.", use_max_mem_size_, kMaxMemorySize, session_id_); | |||||
return ge::GE_GRAPH_OPTIONS_INVALID; | return ge::GE_GRAPH_OPTIONS_INVALID; | ||||
} | } | ||||
GELOGI("Set memory malloc size successfully"); | GELOGI("Set memory malloc size successfully"); | ||||
@@ -855,7 +867,7 @@ Status VarManager::ParseMemoryMallocSize(string &memory_size, size_t &result) { | |||||
if (memory_size.empty()) { | if (memory_size.empty()) { | ||||
REPORT_INNER_ERROR("E19999", "Param memory_size is empty, session_id:%lu, check invalid", | REPORT_INNER_ERROR("E19999", "Param memory_size is empty, session_id:%lu, check invalid", | ||||
session_id_); | session_id_); | ||||
GELOGE(GE_GRAPH_OPTIONS_INVALID, "Memory malloc size input is empty."); | |||||
GELOGE(GE_GRAPH_OPTIONS_INVALID, "[Check][Param] Memory malloc size input is empty, session_id:%lu.", session_id_); | |||||
return GE_GRAPH_OPTIONS_INVALID; | return GE_GRAPH_OPTIONS_INVALID; | ||||
} | } | ||||
// split string by '*' | // split string by '*' | ||||
@@ -882,7 +894,9 @@ Status VarManager::ParseMemoryMallocSize(string &memory_size, size_t &result) { | |||||
if (!isdigit(c)) { | if (!isdigit(c)) { | ||||
REPORT_INNER_ERROR("E19999", "Param memory_size:%s contains non digit, session_id:%lu, check invalid", | REPORT_INNER_ERROR("E19999", "Param memory_size:%s contains non digit, session_id:%lu, check invalid", | ||||
memory_size.c_str(), session_id_); | memory_size.c_str(), session_id_); | ||||
GELOGE(GE_GRAPH_OPTIONS_INVALID, "Memory malloc size input contains non digit."); | |||||
GELOGE(GE_GRAPH_OPTIONS_INVALID, | |||||
"[Check][Param] Memory malloc size:%s input contains non digit, session_id:%lu.", | |||||
memory_size.c_str(), session_id_); | |||||
return GE_GRAPH_OPTIONS_INVALID; | return GE_GRAPH_OPTIONS_INVALID; | ||||
} | } | ||||
} | } | ||||
@@ -891,13 +905,15 @@ Status VarManager::ParseMemoryMallocSize(string &memory_size, size_t &result) { | |||||
REPORT_INNER_ERROR("E19999", "Param memory_size:%s will overflow after multi all, session_id:%lu, " | REPORT_INNER_ERROR("E19999", "Param memory_size:%s will overflow after multi all, session_id:%lu, " | ||||
"check invalid", memory_size.c_str(), | "check invalid", memory_size.c_str(), | ||||
session_id_); | session_id_); | ||||
GELOGE(FAILED, "Input memory size is out of range."); | |||||
GELOGE(FAILED, "[Check][Param] Param memory_size:%s will overflow after multi all, session_id:%lu", | |||||
memory_size.c_str(), session_id_); | |||||
return FAILED); | return FAILED); | ||||
if ((num > kMaxMemorySize) || (result * static_cast<size_t>(num) > kMaxMemorySize)) { | if ((num > kMaxMemorySize) || (result * static_cast<size_t>(num) > kMaxMemorySize)) { | ||||
REPORT_INNER_ERROR("E19999", "Param memory_size:%s after multi will exceed limit:%lu, session_id:%lu, " | REPORT_INNER_ERROR("E19999", "Param memory_size:%s after multi will exceed limit:%lu, session_id:%lu, " | ||||
"check invalid", memory_size.c_str(), kMaxMemorySize, | "check invalid", memory_size.c_str(), kMaxMemorySize, | ||||
session_id_); | session_id_); | ||||
GELOGE(FAILED, "Input memory size can not exceed max memory size : %zu.", kMaxMemorySize); | |||||
GELOGE(FAILED, "[Check][Param] Input memory size can not exceed max memory size:%zu, session_id:%lu.", | |||||
kMaxMemorySize, session_id_); | |||||
return FAILED; | return FAILED; | ||||
} | } | ||||
result *= static_cast<size_t>(num); | result *= static_cast<size_t>(num); | ||||
@@ -1001,10 +1017,7 @@ VarManager *VarManagerPool::GetVarManager(uint64_t session_id) { | |||||
VarManager *var_manager = new (std::nothrow) VarManager(session_id); | VarManager *var_manager = new (std::nothrow) VarManager(session_id); | ||||
if (var_manager == nullptr) { | if (var_manager == nullptr) { | ||||
REPORT_INNER_ERROR("E19999", "New VarManager fail, session_id:%lu", session_id); | REPORT_INNER_ERROR("E19999", "New VarManager fail, session_id:%lu", session_id); | ||||
GELOGE(INTERNAL_ERROR, | |||||
"VarManager::Instance find session by " | |||||
"session_id[%lu] failed.", | |||||
session_id); | |||||
GELOGE(INTERNAL_ERROR, "[New][VarManager] fail, session_id:%lu", session_id); | |||||
static VarManager new_var_manager(0); | static VarManager new_var_manager(0); | ||||
return &new_var_manager; | return &new_var_manager; | ||||
} | } | ||||
@@ -34,8 +34,8 @@ uint8_t *HostMemAllocator::Malloc(size_t size) { | |||||
std::lock_guard<std::mutex> lock(mutex_); | std::lock_guard<std::mutex> lock(mutex_); | ||||
std::shared_ptr<AlignedPtr> aligned_ptr = MakeShared<AlignedPtr>(size); | std::shared_ptr<AlignedPtr> aligned_ptr = MakeShared<AlignedPtr>(size); | ||||
if (aligned_ptr == nullptr) { | if (aligned_ptr == nullptr) { | ||||
REPORT_INNER_ERROR("E19999", "New AlignedPtr fail"); | |||||
GELOGE(INTERNAL_ERROR, "make shared_ptr for AlignedPtr failed"); | |||||
REPORT_INNER_ERROR("E19999", "New AlignedPtr fail, size:%zu", size); | |||||
GELOGE(INTERNAL_ERROR, "[Call][MakeShared] for AlignedPtr failed, size:%zu", size); | |||||
return nullptr; | return nullptr; | ||||
} | } | ||||
allocated_blocks_[aligned_ptr->Get()] = { size, aligned_ptr }; | allocated_blocks_[aligned_ptr->Get()] = { size, aligned_ptr }; | ||||
@@ -46,7 +46,7 @@ uint8_t *HostMemAllocator::Malloc(size_t size) { | |||||
Status HostMemAllocator::Free(const void *memory_addr) { | Status HostMemAllocator::Free(const void *memory_addr) { | ||||
if (memory_addr == nullptr) { | if (memory_addr == nullptr) { | ||||
REPORT_INNER_ERROR("E19999", "Param memory_addr is nullptr, check invalid"); | REPORT_INNER_ERROR("E19999", "Param memory_addr is nullptr, check invalid"); | ||||
GELOGE(GE_GRAPH_FREE_FAILED, "Invalid memory pointer"); | |||||
GELOGE(GE_GRAPH_FREE_FAILED, "[Check][Param] Invalid memory pointer"); | |||||
return GE_GRAPH_FREE_FAILED; | return GE_GRAPH_FREE_FAILED; | ||||
} | } | ||||
@@ -54,7 +54,7 @@ Status HostMemAllocator::Free(const void *memory_addr) { | |||||
auto it = allocated_blocks_.find(memory_addr); | auto it = allocated_blocks_.find(memory_addr); | ||||
if (it == allocated_blocks_.end()) { | if (it == allocated_blocks_.end()) { | ||||
REPORT_INNER_ERROR("E19999", "Memory_addr is not alloc before, check invalid"); | REPORT_INNER_ERROR("E19999", "Memory_addr is not alloc before, check invalid"); | ||||
GELOGE(PARAM_INVALID, "Invalid memory pointer"); | |||||
GELOGE(PARAM_INVALID, "[Check][Param] Invalid memory pointer:%p", memory_addr); | |||||
return PARAM_INVALID; | return PARAM_INVALID; | ||||
} | } | ||||
it->second.second.reset(); | it->second.second.reset(); | ||||
@@ -39,9 +39,8 @@ Status SharedMemAllocator::Allocate(SharedMemInfo &mem_info) { | |||||
rtMallocHostSharedMemoryOut output_para; | rtMallocHostSharedMemoryOut output_para; | ||||
rtError_t rt_ret = rtMallocHostSharedMemory(&input_para, &output_para); | rtError_t rt_ret = rtMallocHostSharedMemory(&input_para, &output_para); | ||||
if (rt_ret != RT_ERROR_NONE) { | if (rt_ret != RT_ERROR_NONE) { | ||||
REPORT_CALL_ERROR("E19999", "Call rtMallocHostSharedMemory fail, ret:0x%X", | |||||
rt_ret); | |||||
GELOGE(RT_FAILED, "Call rt api(rtMallocHostSharedMemory) failed, devid:[%u].", device_id); | |||||
REPORT_CALL_ERROR("E19999", "Call rtMallocHostSharedMemory fail, ret:0x%X", rt_ret); | |||||
GELOGE(RT_FAILED, "[Call][RtMallocHostSharedMemory] failed, devid:[%u].", device_id); | |||||
return GE_GRAPH_MEMORY_ALLOC_FAILED; | return GE_GRAPH_MEMORY_ALLOC_FAILED; | ||||
} | } | ||||
mem_info.fd = output_para.fd; | mem_info.fd = output_para.fd; | ||||
@@ -60,9 +59,8 @@ Status SharedMemAllocator::DeAllocate(SharedMemInfo &mem_info) { | |||||
mem_info.host_aligned_ptr->MutableGet(), mem_info.device_address}; | mem_info.host_aligned_ptr->MutableGet(), mem_info.device_address}; | ||||
rtError_t rt_ret = rtFreeHostSharedMemory(&free_para); | rtError_t rt_ret = rtFreeHostSharedMemory(&free_para); | ||||
if (rt_ret != RT_ERROR_NONE) { | if (rt_ret != RT_ERROR_NONE) { | ||||
REPORT_CALL_ERROR("E19999", "Call rtFreeHostSharedMemory fail, ret:0x%X", | |||||
rt_ret); | |||||
GELOGE(RT_FAILED, "Call rt api(rtFreeHostSharedMemory) failed, ret: 0x%X.", rt_ret); | |||||
REPORT_CALL_ERROR("E19999", "Call rtFreeHostSharedMemory fail, ret:0x%X", rt_ret); | |||||
GELOGE(RT_FAILED, "[Call][RtFreeHostSharedMemory] failed, ret:0x%X.", rt_ret); | |||||
return RT_FAILED; | return RT_FAILED; | ||||
} | } | ||||
return ge::SUCCESS; | return ge::SUCCESS; | ||||
@@ -78,7 +76,7 @@ Status HostMemManager::Initialize() { | |||||
allocator_ = std::unique_ptr<SharedMemAllocator>(new (std::nothrow) SharedMemAllocator()); | allocator_ = std::unique_ptr<SharedMemAllocator>(new (std::nothrow) SharedMemAllocator()); | ||||
if (allocator_ == nullptr) { | if (allocator_ == nullptr) { | ||||
REPORT_CALL_ERROR("E19999", "New SharedMemAllocator fail"); | REPORT_CALL_ERROR("E19999", "New SharedMemAllocator fail"); | ||||
GELOGE(GE_GRAPH_MALLOC_FAILED, "Shared memory allocator init failed!"); | |||||
GELOGE(GE_GRAPH_MALLOC_FAILED, "[New][SharedMemAllocator] failed!"); | |||||
return GE_GRAPH_MALLOC_FAILED; | return GE_GRAPH_MALLOC_FAILED; | ||||
} | } | ||||
return SUCCESS; | return SUCCESS; | ||||
@@ -98,9 +96,8 @@ Status HostMemManager::MallocSharedMemory(SharedMemInfo &mem_info) { | |||||
std::lock_guard<std::recursive_mutex> lock(mutex_); | std::lock_guard<std::recursive_mutex> lock(mutex_); | ||||
auto iter = var_memory_base_map_.find(mem_info.op_name); | auto iter = var_memory_base_map_.find(mem_info.op_name); | ||||
if (iter != var_memory_base_map_.end()) { | if (iter != var_memory_base_map_.end()) { | ||||
REPORT_INNER_ERROR("E19999", "MemInfo.op_name:%s can't find in var_memory_base_map_", | |||||
mem_info.op_name.c_str()); | |||||
GELOGE(FAILED, "Host shared memory for op %s has been malloced", mem_info.op_name.c_str()); | |||||
REPORT_INNER_ERROR("E19999", "Host shared memory for op %s has been malloced", mem_info.op_name.c_str()); | |||||
GELOGE(FAILED, "[Check][Param] Host shared memory for op %s has been malloced", mem_info.op_name.c_str()); | |||||
return FAILED; | return FAILED; | ||||
} | } | ||||
mem_info.shm_name = OpNameToShmName(mem_info.op_name); | mem_info.shm_name = OpNameToShmName(mem_info.op_name); | ||||
@@ -113,9 +110,8 @@ Status HostMemManager::MallocSharedMemory(SharedMemInfo &mem_info) { | |||||
Status HostMemManager::QueryVarMemInfo(const string &op_name, uint64_t &base_addr, uint64_t &data_size) { | Status HostMemManager::QueryVarMemInfo(const string &op_name, uint64_t &base_addr, uint64_t &data_size) { | ||||
std::lock_guard<std::recursive_mutex> lock(mutex_); | std::lock_guard<std::recursive_mutex> lock(mutex_); | ||||
if (var_memory_base_map_.find(op_name) == var_memory_base_map_.end()) { | if (var_memory_base_map_.find(op_name) == var_memory_base_map_.end()) { | ||||
REPORT_INNER_ERROR("E19999", "MemInfo.op_name:%s can't find in var_memory_base_map_", | |||||
op_name.c_str()); | |||||
GELOGE(INTERNAL_ERROR, "Find host base base_addr failed,node name:%s!", op_name.c_str()); | |||||
REPORT_INNER_ERROR("E19999", "MemInfo.op_name:%s can't find in var_memory_base_map_", op_name.c_str()); | |||||
GELOGE(INTERNAL_ERROR, "[Check][Param] Find host base base_addr failed, node name:%s!", op_name.c_str()); | |||||
return INTERNAL_ERROR; | return INTERNAL_ERROR; | ||||
} | } | ||||
base_addr = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(var_memory_base_map_[op_name].device_address)); | base_addr = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(var_memory_base_map_[op_name].device_address)); | ||||
@@ -50,9 +50,8 @@ Status RdmaRemoteRegister(const std::vector<HostVarInfo> &var_info, rtMemType_t | |||||
path.append(file_name); | path.append(file_name); | ||||
string canonical_path = RealPath(path.c_str()); | string canonical_path = RealPath(path.c_str()); | ||||
if (canonical_path.empty()) { | if (canonical_path.empty()) { | ||||
REPORT_INNER_ERROR("E19999", "canonical_path:%s is empty, check invalid", | |||||
canonical_path.c_str()); | |||||
GELOGE(FAILED, "Failed to get realpath of %s", path.c_str()); | |||||
REPORT_INNER_ERROR("E19999", "canonical_path:%s is empty, check invalid", canonical_path.c_str()); | |||||
GELOGE(FAILED, "[Call][RealPath] Failed to get realpath of %s", path.c_str()); | |||||
return FAILED; | return FAILED; | ||||
} | } | ||||
GELOGI("FileName:%s, Path:%s.", file_name.c_str(), canonical_path.c_str()); | GELOGI("FileName:%s, Path:%s.", file_name.c_str(), canonical_path.c_str()); | ||||
@@ -69,15 +68,14 @@ Status RdmaRemoteRegister(const std::vector<HostVarInfo> &var_info, rtMemType_t | |||||
if (hcom_remote_mem_register == nullptr) { | if (hcom_remote_mem_register == nullptr) { | ||||
REPORT_CALL_ERROR("E19999", "Symbol HcomRegRemoteAccessMem can't find in %s, check invalid", | REPORT_CALL_ERROR("E19999", "Symbol HcomRegRemoteAccessMem can't find in %s, check invalid", | ||||
canonical_path.c_str()); | canonical_path.c_str()); | ||||
GELOGE(FAILED, "Failed to invoke hcom_remote_mem_register function."); | |||||
GELOGE(FAILED, "[Check][Param] Symbol HcomRegRemoteAccessMem can't find in %s", canonical_path.c_str()); | |||||
return FAILED; | return FAILED; | ||||
} | } | ||||
HcclResult hccl_ret = hcom_remote_mem_register(reg_addrs.get(), table_len); | HcclResult hccl_ret = hcom_remote_mem_register(reg_addrs.get(), table_len); | ||||
if (hccl_ret != HCCL_SUCCESS) { | if (hccl_ret != HCCL_SUCCESS) { | ||||
REPORT_CALL_ERROR("E19999", "Call hcom_remote_mem_register failed, ret:%d,", | |||||
hccl_ret); | |||||
GELOGE(HCCL_E_INTERNAL, "Rdma mem register failed, ret: 0x%X", hccl_ret); | |||||
REPORT_CALL_ERROR("E19999", "Call hcom_remote_mem_register failed, ret:%d,", hccl_ret); | |||||
GELOGE(HCCL_E_INTERNAL, "[Call][HcomRemoteMemRegister] Rdma mem register failed, ret:0x%X", hccl_ret); | |||||
return HCCL_E_INTERNAL; | return HCCL_E_INTERNAL; | ||||
} | } | ||||
return SUCCESS; | return SUCCESS; | ||||
@@ -88,14 +86,14 @@ Status MallocSharedMemory(const TensorInfo &tensor_info, uint64_t &dev_addr, uin | |||||
uint32_t type_size = 0; | uint32_t type_size = 0; | ||||
bool result = TypeUtils::GetDataTypeLength(tensor_info.data_type, type_size); | bool result = TypeUtils::GetDataTypeLength(tensor_info.data_type, type_size); | ||||
if (!result) { | if (!result) { | ||||
GELOGE(GRAPH_FAILED, "GetDataTypeLength failed, data_type=(%s).", | |||||
GELOGE(GRAPH_FAILED, "[Get][DataTypeLength] failed, data_type=(%s).", | |||||
TypeUtils::DataTypeToSerialString(tensor_info.data_type).c_str()); | TypeUtils::DataTypeToSerialString(tensor_info.data_type).c_str()); | ||||
return GRAPH_FAILED; | return GRAPH_FAILED; | ||||
} | } | ||||
memory_size = type_size; | memory_size = type_size; | ||||
for (auto dim : tensor_info.dims) { | for (auto dim : tensor_info.dims) { | ||||
if (dim <= 0) { | if (dim <= 0) { | ||||
GELOGE(GRAPH_FAILED, "Tensor dims should be positive"); | |||||
GELOGE(GRAPH_FAILED, "[Check][Param] Tensor dims should be positive"); | |||||
return GRAPH_FAILED; | return GRAPH_FAILED; | ||||
} | } | ||||
memory_size *= dim; | memory_size *= dim; | ||||
@@ -103,7 +101,7 @@ Status MallocSharedMemory(const TensorInfo &tensor_info, uint64_t &dev_addr, uin | |||||
SharedMemInfo mem_info(tensor_info.var_name, memory_size); | SharedMemInfo mem_info(tensor_info.var_name, memory_size); | ||||
Status ret = HostMemManager::Instance().MallocSharedMemory(mem_info); | Status ret = HostMemManager::Instance().MallocSharedMemory(mem_info); | ||||
if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
GELOGE(GRAPH_FAILED, "MallocSharedMemory failed op name [%s]", tensor_info.var_name.c_str()); | |||||
GELOGE(GRAPH_FAILED, "[Malloc][SharedMemory] failed, op name [%s]", tensor_info.var_name.c_str()); | |||||
return GRAPH_FAILED; | return GRAPH_FAILED; | ||||
} | } | ||||
dev_addr = reinterpret_cast<uint64_t>(reinterpret_cast<uintptr_t>(mem_info.device_address)); | dev_addr = reinterpret_cast<uint64_t>(reinterpret_cast<uintptr_t>(mem_info.device_address)); | ||||
@@ -45,7 +45,7 @@ Status EventManager::Init(size_t event_num) { | |||||
void EventManager::Release() noexcept { | void EventManager::Release() noexcept { | ||||
for (size_t i = 0; i < this->event_list_.size(); ++i) { | for (size_t i = 0; i < this->event_list_.size(); ++i) { | ||||
rtError_t rt_ret = rtEventDestroy(this->event_list_[i]); | rtError_t rt_ret = rtEventDestroy(this->event_list_[i]); | ||||
RETURN_IF_COND_NOT_MET(rt_ret == RT_ERROR_NONE, "Destroy event failed, idx is %zu, ret is 0x%x.", i, rt_ret); | |||||
RETURN_IF_COND_NOT_MET(rt_ret == RT_ERROR_NONE, "[Destroy][Event] failed, idx is %zu, ret is 0x%x.", i, rt_ret); | |||||
} | } | ||||
this->event_list_.clear(); | this->event_list_.clear(); | ||||
@@ -82,8 +82,8 @@ Status RdmaPoolAllocator::InitMemory(size_t mem_size) { | |||||
auto device_id = GetContext().DeviceId(); | auto device_id = GetContext().DeviceId(); | ||||
GELOGD("Init Rdma Memory with size [%zu] for devid:[%u]", mem_size, device_id); | GELOGD("Init Rdma Memory with size [%zu] for devid:[%u]", mem_size, device_id); | ||||
if (rdma_base_addr_ != nullptr) { | if (rdma_base_addr_ != nullptr) { | ||||
REPORT_INNER_ERROR("E19999", "Param rdma_base_addr_ is nullptr, check invalid"); | |||||
GELOGE(GE_MULTI_INIT, "Rdma pool has been malloced"); | |||||
REPORT_INNER_ERROR("E19999", "Param rdma_base_addr_ is not nullptr, devid:%u, check invalid", device_id); | |||||
GELOGE(GE_MULTI_INIT, "[Check][Param] Rdma pool has been malloced, devid:%u", device_id); | |||||
return GE_MULTI_INIT; | return GE_MULTI_INIT; | ||||
} | } | ||||
const std::string purpose = "Memory for rdma pool."; | const std::string purpose = "Memory for rdma pool."; | ||||
@@ -95,15 +95,15 @@ Status RdmaPoolAllocator::InitMemory(size_t mem_size) { | |||||
rdma_base_addr_ = memory_allocator_->MallocMemory(purpose, mem_size, device_id); | rdma_base_addr_ = memory_allocator_->MallocMemory(purpose, mem_size, device_id); | ||||
if (rdma_base_addr_ == nullptr) { | if (rdma_base_addr_ == nullptr) { | ||||
GELOGE(GE_GRAPH_MALLOC_FAILED, "Rdma pool memory malloc failed"); | |||||
GELOGE(GE_GRAPH_MALLOC_FAILED, "[Malloc][Memory] failed, size:%zu, device_id:%u", mem_size, device_id); | |||||
return GE_GRAPH_MALLOC_FAILED; | return GE_GRAPH_MALLOC_FAILED; | ||||
} | } | ||||
rdma_mem_size_ = mem_size; | rdma_mem_size_ = mem_size; | ||||
// Init with a base block. | // Init with a base block. | ||||
auto *base_block = new (std::nothrow) Block(device_id, mem_size, rdma_base_addr_); | auto *base_block = new (std::nothrow) Block(device_id, mem_size, rdma_base_addr_); | ||||
if (base_block == nullptr) { | if (base_block == nullptr) { | ||||
REPORT_CALL_ERROR("E19999", "New Block failed, device_id:%u", device_id); | |||||
GELOGE(GE_GRAPH_MALLOC_FAILED, "Block malloc failed"); | |||||
REPORT_CALL_ERROR("E19999", "New Block failed, size:%zu, device_id:%u", mem_size, device_id); | |||||
GELOGE(GE_GRAPH_MALLOC_FAILED, "[New][Block] failed, size:%zu, device_id:%u", mem_size, device_id); | |||||
return GE_GRAPH_MALLOC_FAILED; | return GE_GRAPH_MALLOC_FAILED; | ||||
} | } | ||||
block_bin_.insert(base_block); | block_bin_.insert(base_block); | ||||
@@ -123,7 +123,7 @@ uint8_t *RdmaPoolAllocator::Malloc(size_t size, uint32_t device_id) { | |||||
if (block->ptr == nullptr) { | if (block->ptr == nullptr) { | ||||
REPORT_INNER_ERROR("E19999", "Rdmapool memory address is nullptr, device_id:%u, check invalid", | REPORT_INNER_ERROR("E19999", "Rdmapool memory address is nullptr, device_id:%u, check invalid", | ||||
device_id); | device_id); | ||||
GELOGE(INTERNAL_ERROR, "Rdmapool memory address is nullptr."); | |||||
GELOGE(INTERNAL_ERROR, "[Check][Param] Rdmapool memory address is nullptr, device_id:%u", device_id); | |||||
return nullptr; | return nullptr; | ||||
} | } | ||||
allocated_blocks_.emplace(block->ptr, block); | allocated_blocks_.emplace(block->ptr, block); | ||||
@@ -155,9 +155,8 @@ uint8_t *RdmaPoolAllocator::Malloc(size_t size, uint32_t device_id) { | |||||
Status RdmaPoolAllocator::Free(uint8_t *memory_addr, uint32_t device_id) { | Status RdmaPoolAllocator::Free(uint8_t *memory_addr, uint32_t device_id) { | ||||
GELOGI("Free rdma memory, device id = %u", device_id); | GELOGI("Free rdma memory, device id = %u", device_id); | ||||
if (memory_addr == nullptr) { | if (memory_addr == nullptr) { | ||||
REPORT_INNER_ERROR("E19999", "Param memory_addr is nullptr, device_id:%u, check invalid", | |||||
device_id); | |||||
GELOGE(GE_GRAPH_FREE_FAILED, "Invalid memory pointer"); | |||||
REPORT_INNER_ERROR("E19999", "Param memory_addr is nullptr, device_id:%u, check invalid", device_id); | |||||
GELOGE(GE_GRAPH_FREE_FAILED, "[Check][Param] Invalid memory pointer, device id:%u", device_id); | |||||
return GE_GRAPH_FREE_FAILED; | return GE_GRAPH_FREE_FAILED; | ||||
} | } | ||||
@@ -166,7 +165,7 @@ Status RdmaPoolAllocator::Free(uint8_t *memory_addr, uint32_t device_id) { | |||||
if (it == allocated_blocks_.end()) { | if (it == allocated_blocks_.end()) { | ||||
REPORT_INNER_ERROR("E19999", "Param memory_addr is not allocated before, device_id:%u, " | REPORT_INNER_ERROR("E19999", "Param memory_addr is not allocated before, device_id:%u, " | ||||
"check invalid", device_id); | "check invalid", device_id); | ||||
GELOGE(PARAM_INVALID, "Invalid memory pointer"); | |||||
GELOGE(PARAM_INVALID, "[Check][Param] Invalid memory pointer, device id:%u", device_id); | |||||
return PARAM_INVALID; | return PARAM_INVALID; | ||||
} | } | ||||
@@ -209,7 +208,7 @@ void RdmaPoolAllocator::MergeBlocks(Block *dst, Block *src) { | |||||
Status RdmaPoolAllocator::GetBaseAddr(uint64_t &base_addr, uint64_t &mem_size) { | Status RdmaPoolAllocator::GetBaseAddr(uint64_t &base_addr, uint64_t &mem_size) { | ||||
if (rdma_base_addr_ == nullptr) { | if (rdma_base_addr_ == nullptr) { | ||||
REPORT_INNER_ERROR("E19999", "Param rdma_base_addr_ is nullptr, check invalid"); | REPORT_INNER_ERROR("E19999", "Param rdma_base_addr_ is nullptr, check invalid"); | ||||
GELOGE(INTERNAL_ERROR, "Rdma base addr is nullptr."); | |||||
GELOGE(INTERNAL_ERROR, "[Check][Param] Rdma base addr is nullptr."); | |||||
return INTERNAL_ERROR; | return INTERNAL_ERROR; | ||||
} | } | ||||
base_addr = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(rdma_base_addr_)); | base_addr = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(rdma_base_addr_)); | ||||
@@ -37,7 +37,8 @@ class RtContextSwitchGuard { | |||||
if (ret != RT_ERROR_NONE) { | if (ret != RT_ERROR_NONE) { | ||||
REPORT_CALL_ERROR("E19999", "Call rtCtxGetCurrent failed, device_id:%u, ret:0x%X,", | REPORT_CALL_ERROR("E19999", "Call rtCtxGetCurrent failed, device_id:%u, ret:0x%X,", | ||||
device_id, ret); | device_id, ret); | ||||
GELOGE(RT_FAILED, "Failed to get current context from rt, error-code %d", ret); | |||||
GELOGE(RT_FAILED, "[Call][RtCtxGetCurrent] Failed to get current context, device_id:%u, ret:0x%X", | |||||
device_id, ret); | |||||
return; | return; | ||||
} | } | ||||
@@ -45,15 +46,14 @@ class RtContextSwitchGuard { | |||||
if (ret != RT_ERROR_NONE) { | if (ret != RT_ERROR_NONE) { | ||||
REPORT_CALL_ERROR("E19999", "Call rtCtxCreate failed, device_id:%u, ret:0x%X,", | REPORT_CALL_ERROR("E19999", "Call rtCtxCreate failed, device_id:%u, ret:0x%X,", | ||||
device_id, ret); | device_id, ret); | ||||
GELOGE(RT_FAILED, "Failed to create new context for device %u, error-code %d", device_id, ret); | |||||
GELOGE(RT_FAILED, "[Call][RtCtxCreate] Failed to create new context for device:%u, ret:%d", device_id, ret); | |||||
return; | return; | ||||
} | } | ||||
ret = rtCtxSetCurrent(current_); | ret = rtCtxSetCurrent(current_); | ||||
if (ret != RT_ERROR_NONE) { | if (ret != RT_ERROR_NONE) { | ||||
REPORT_CALL_ERROR("E19999", "Call rtCtxSetCurrent failed, device_id:%u, ret:0x%X,", | |||||
device_id, ret); | |||||
GELOGE(RT_FAILED, "Failed to switch context to normal, context %p, device %u", current_, device_id); | |||||
REPORT_CALL_ERROR("E19999", "Call rtCtxSetCurrent failed, device_id:%u, ret:0x%X", device_id, ret); | |||||
GELOGE(RT_FAILED, "[Call][RtCtxSetCurrent] failed, device_id:%u, ret:0x%X", device_id, ret); | |||||
return; | return; | ||||
} | } | ||||
GELOGD("Create and switch rt context %p type %d for device %u, backup last %p.", current_, mode, device_id, last_); | GELOGD("Create and switch rt context %p type %d for device %u, backup last %p.", current_, mode, device_id, last_); | ||||
@@ -80,7 +80,7 @@ int64_t CalcVarSizeInBytes(const GeTensorDesc &desc) { | |||||
if (var_size <= 0) { | if (var_size <= 0) { | ||||
REPORT_INNER_ERROR("E19999", "Data type:%s in desc, it's size:%ld < 0, check invalid", | REPORT_INNER_ERROR("E19999", "Data type:%s in desc, it's size:%ld < 0, check invalid", | ||||
TypeUtils::DataTypeToSerialString(desc.GetDataType()).c_str(), var_size); | TypeUtils::DataTypeToSerialString(desc.GetDataType()).c_str(), var_size); | ||||
GELOGE(PARAM_INVALID, "Failed to calc var data size from data type %s", | |||||
GELOGE(PARAM_INVALID, "[Calc][VarDataSize] by data type %s failed.", | |||||
TypeUtils::DataTypeToSerialString(desc.GetDataType()).c_str()); | TypeUtils::DataTypeToSerialString(desc.GetDataType()).c_str()); | ||||
return -1; | return -1; | ||||
} | } | ||||
@@ -99,7 +99,8 @@ Status CopyVarToDevice(const NodePtr &var, const formats::TransResult &trans_res | |||||
if (ret != RT_ERROR_NONE) { | if (ret != RT_ERROR_NONE) { | ||||
REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, op:%s(%s), size:%lu, ret:0x%X,", var->GetName().c_str(), | REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, op:%s(%s), size:%lu, ret:0x%X,", var->GetName().c_str(), | ||||
var->GetType().c_str(), trans_result.length, ret); | var->GetType().c_str(), trans_result.length, ret); | ||||
GELOGE(RT_FAILED, "Failed to copy memory to device, size %zu", trans_result.length); | |||||
GELOGE(RT_FAILED, "[Call][RtMemcpy] failed, op:%s(%s), size:%lu, ret:0x%X,", var->GetName().c_str(), | |||||
var->GetType().c_str(), trans_result.length, ret); | |||||
return RT_FAILED; | return RT_FAILED; | ||||
} | } | ||||
return SUCCESS; | return SUCCESS; | ||||
@@ -111,21 +112,17 @@ Status CopyVarFromDevice(uint64_t session_id, const NodePtr &var, std::unique_pt | |||||
GE_CHECK_NOTNULL(var); | GE_CHECK_NOTNULL(var); | ||||
auto ret = VarManager::Instance(session_id)->GetVarAddr(var->GetName(), input_desc, &var_logic); | auto ret = VarManager::Instance(session_id)->GetVarAddr(var->GetName(), input_desc, &var_logic); | ||||
if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
GELOGE(INTERNAL_ERROR, | |||||
"Failed to copy var %s from device, can not find it" | |||||
" from var manager %u", | |||||
var->GetName().c_str(), ret); | |||||
GELOGE(INTERNAL_ERROR, "[Get][VarAddr] failed, node:%s, session_id:%lu, ret:%d", | |||||
var->GetName().c_str(), session_id, ret); | |||||
return INTERNAL_ERROR; | return INTERNAL_ERROR; | ||||
} | } | ||||
uint8_t *var_addr = VarManager::Instance(session_id)->GetVarMemoryAddr(var_logic, RT_MEMORY_HBM); | uint8_t *var_addr = VarManager::Instance(session_id)->GetVarMemoryAddr(var_logic, RT_MEMORY_HBM); | ||||
if (var_addr == nullptr) { | if (var_addr == nullptr) { | ||||
REPORT_CALL_ERROR("E19999", "Get variable memory addr failed, mem_type:%d, op:%s(%s), session_id:%lu,", | |||||
REPORT_CALL_ERROR("E19999", "Get variable memory addr failed, mem_type:%d, op:%s(%s), session_id:%lu", | |||||
RT_MEMORY_HBM, var->GetName().c_str(), var->GetType().c_str(), session_id); | RT_MEMORY_HBM, var->GetName().c_str(), var->GetType().c_str(), session_id); | ||||
GELOGE(INTERNAL_ERROR, | |||||
"Failed to copy var %s from device, cant not get " | |||||
"var addr from logic addr %p", | |||||
var->GetName().c_str(), var_logic); | |||||
GELOGE(INTERNAL_ERROR, "[Get][VarMemoryAddr] failed, mem_type:%d, op:%s(%s), session_id:%lu", | |||||
RT_MEMORY_HBM, var->GetName().c_str(), var->GetType().c_str(), session_id); | |||||
return INTERNAL_ERROR; | return INTERNAL_ERROR; | ||||
} | } | ||||
@@ -136,9 +133,10 @@ Status CopyVarFromDevice(uint64_t session_id, const NodePtr &var, std::unique_pt | |||||
std::unique_ptr<uint8_t[]> var_host(new(std::nothrow) uint8_t[var_size_bytes]); | std::unique_ptr<uint8_t[]> var_host(new(std::nothrow) uint8_t[var_size_bytes]); | ||||
if (var_host == nullptr) { | if (var_host == nullptr) { | ||||
REPORT_CALL_ERROR("E19999", "New host memory failed, size:%ld, op:%s(%s), session_id:%lu,", | |||||
REPORT_CALL_ERROR("E19999", "New host memory failed, size:%ld, op:%s(%s), session_id:%lu", | |||||
var_size_bytes, var->GetName().c_str(), var->GetType().c_str(), session_id); | var_size_bytes, var->GetName().c_str(), var->GetType().c_str(), session_id); | ||||
GELOGE(OUT_OF_MEMORY, "Failed to malloc rt-host memory, size %ld", var_size_bytes); | |||||
GELOGE(OUT_OF_MEMORY, "[New][Memory] for rt-host failed, size:%ld, op:%s(%s), session_id:%lu", | |||||
var_size_bytes, var->GetName().c_str(), var->GetType().c_str(), session_id); | |||||
return OUT_OF_MEMORY; | return OUT_OF_MEMORY; | ||||
} | } | ||||
@@ -147,10 +145,8 @@ Status CopyVarFromDevice(uint64_t session_id, const NodePtr &var, std::unique_pt | |||||
if (ret != RT_ERROR_NONE) { | if (ret != RT_ERROR_NONE) { | ||||
REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%ld, op:%s(%s), session_id:%lu, ret:0x%X", | REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%ld, op:%s(%s), session_id:%lu, ret:0x%X", | ||||
var_size_bytes, var->GetName().c_str(), var->GetType().c_str(), session_id, ret); | var_size_bytes, var->GetName().c_str(), var->GetType().c_str(), session_id, ret); | ||||
GELOGE(RT_FAILED, | |||||
"Failed to copy var memory from device, var %s, size %ld," | |||||
" rt-error-code %u", | |||||
var->GetName().c_str(), var_size_bytes, ret); | |||||
GELOGE(RT_FAILED, "[Call][RtMemcpy] failed, size:%ld, op:%s(%s), session_id:%lu, ret:0x%X", | |||||
var_size_bytes, var->GetName().c_str(), var->GetType().c_str(), session_id, ret); | |||||
return RT_FAILED; | return RT_FAILED; | ||||
} | } | ||||
@@ -197,9 +193,7 @@ Status TransVarOnHost(uint8_t *var_data, const VarTransRoad &trans_road, formats | |||||
formats::ShapeToString(src_shape).c_str(), | formats::ShapeToString(src_shape).c_str(), | ||||
formats::ShapeToString(dst_shape).c_str(), | formats::ShapeToString(dst_shape).c_str(), | ||||
TypeUtils::DataTypeToSerialString(data_type).c_str(), ret); | TypeUtils::DataTypeToSerialString(data_type).c_str(), ret); | ||||
GELOGE(INTERNAL_ERROR, | |||||
"Failed to trans format from %s to %s, shape %s to %s, " | |||||
"data type %s error code %u", | |||||
GELOGE(INTERNAL_ERROR, "[Trans][Format] from %s to %s, shape %s to %s failed, data type %s error code %u", | |||||
TypeUtils::FormatToSerialString(src_format).c_str(), TypeUtils::FormatToSerialString(dst_format).c_str(), | TypeUtils::FormatToSerialString(src_format).c_str(), TypeUtils::FormatToSerialString(dst_format).c_str(), | ||||
formats::ShapeToString(src_shape).c_str(), formats::ShapeToString(dst_shape).c_str(), | formats::ShapeToString(src_shape).c_str(), formats::ShapeToString(dst_shape).c_str(), | ||||
TypeUtils::DataTypeToSerialString(data_type).c_str(), ret); | TypeUtils::DataTypeToSerialString(data_type).c_str(), ret); | ||||
@@ -221,7 +215,7 @@ Status TransVarOnHost(uint8_t *var_data, const VarTransRoad &trans_road, formats | |||||
TypeUtils::DataTypeToSerialString(src_data_type).c_str(), | TypeUtils::DataTypeToSerialString(src_data_type).c_str(), | ||||
TypeUtils::DataTypeToSerialString(dst_data_type).c_str(), | TypeUtils::DataTypeToSerialString(dst_data_type).c_str(), | ||||
formats::ShapeToString(input_shape).c_str(), src_data_size, ret); | formats::ShapeToString(input_shape).c_str(), src_data_size, ret); | ||||
GELOGE(INTERNAL_ERROR, "Failed to trans data type from %s to %s, input shape %s, data size %ld, error code %u", | |||||
GELOGE(INTERNAL_ERROR, "[Trans][DataType] from %s to %s failed, input shape %s, data size %ld, error code %u", | |||||
TypeUtils::DataTypeToSerialString(src_data_type).c_str(), | TypeUtils::DataTypeToSerialString(src_data_type).c_str(), | ||||
TypeUtils::DataTypeToSerialString(dst_data_type).c_str(), formats::ShapeToString(input_shape).c_str(), | TypeUtils::DataTypeToSerialString(dst_data_type).c_str(), formats::ShapeToString(input_shape).c_str(), | ||||
src_data_size, ret); | src_data_size, ret); | ||||
@@ -230,7 +224,7 @@ Status TransVarOnHost(uint8_t *var_data, const VarTransRoad &trans_road, formats | |||||
} else { | } else { | ||||
REPORT_INNER_ERROR("E19999", "Trans var data failed, the trans type %s does not supported, check invalid", | REPORT_INNER_ERROR("E19999", "Trans var data failed, the trans type %s does not supported, check invalid", | ||||
trans_info.node_type.c_str()); | trans_info.node_type.c_str()); | ||||
GELOGE(UNSUPPORTED, "Failed to trans var data, the trans type %s does not supported", | |||||
GELOGE(UNSUPPORTED, "[Trans][VarData] failed, the trans type %s does not supported", | |||||
trans_info.node_type.c_str()); | trans_info.node_type.c_str()); | ||||
return UNSUPPORTED; | return UNSUPPORTED; | ||||
} | } | ||||
@@ -255,10 +249,8 @@ Status ReAssignVarAddr(uint64_t session_id, | |||||
uint8_t *var_logic = nullptr; | uint8_t *var_logic = nullptr; | ||||
Status ret = VarManager::Instance(session_id)->GetVarAddr(var_name, tensor_desc, &var_logic); | Status ret = VarManager::Instance(session_id)->GetVarAddr(var_name, tensor_desc, &var_logic); | ||||
if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
GELOGE(INTERNAL_ERROR, | |||||
"Failed to get var %s device addr, can not find it" | |||||
" from var manager %u", | |||||
var_name.c_str(), ret); | |||||
GELOGE(INTERNAL_ERROR, "[Get][VarAddr] failed, var name:%s, session_id:%lu, ret:%u", | |||||
var_name.c_str(), session_id, ret); | |||||
return INTERNAL_ERROR; | return INTERNAL_ERROR; | ||||
} | } | ||||
@@ -266,7 +258,8 @@ Status ReAssignVarAddr(uint64_t session_id, | |||||
if (var_addr == nullptr) { | if (var_addr == nullptr) { | ||||
REPORT_CALL_ERROR("E19999", "Get variable memory addr failed, mem_type:%d, var_name:%s, session_id:%lu,", | REPORT_CALL_ERROR("E19999", "Get variable memory addr failed, mem_type:%d, var_name:%s, session_id:%lu,", | ||||
RT_MEMORY_HBM, var_name.c_str(), session_id); | RT_MEMORY_HBM, var_name.c_str(), session_id); | ||||
GELOGE(INTERNAL_ERROR, "Failed to convert var %s logic addr to real addr", var_name.c_str()); | |||||
GELOGE(INTERNAL_ERROR, "[Get][VarMemoryAddr] failed, mem_type:%d, var_name:%s, session_id:%lu", | |||||
RT_MEMORY_HBM, var_name.c_str(), session_id); | |||||
return INTERNAL_ERROR; | return INTERNAL_ERROR; | ||||
} | } | ||||
*var_device = var_addr; | *var_device = var_addr; | ||||
@@ -293,9 +286,8 @@ Status TransVarData(const NodePtr &var, const VarTransRoad &trans_road, uint64_t | |||||
// Sync var data from device | // Sync var data from device | ||||
std::unique_ptr<uint8_t[]> var_data; | std::unique_ptr<uint8_t[]> var_data; | ||||
if (trans_road.empty()) { | if (trans_road.empty()) { | ||||
REPORT_INNER_ERROR("E19999", "Param trans_road is empty, session_id:%lu, check invalid", | |||||
session_id); | |||||
GELOGE(INTERNAL_ERROR, "Failed to get trans_road, trans_road is empty."); | |||||
REPORT_INNER_ERROR("E19999", "Param trans_road is empty, session_id:%lu, check invalid", session_id); | |||||
GELOGE(INTERNAL_ERROR, "[Check][Param] trans_road is empty, session_id:%lu", session_id); | |||||
return INTERNAL_ERROR; | return INTERNAL_ERROR; | ||||
} | } | ||||
const GeTensorDesc &input_desc = trans_road.begin()->input; | const GeTensorDesc &input_desc = trans_road.begin()->input; | ||||
@@ -307,7 +299,7 @@ Status TransVarData(const NodePtr &var, const VarTransRoad &trans_road, uint64_t | |||||
formats::TransResult trans_result{}; | formats::TransResult trans_result{}; | ||||
ret = TransVarOnHost(var_data.get(), trans_road, trans_result); | ret = TransVarOnHost(var_data.get(), trans_road, trans_result); | ||||
if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
GELOGE(ret, "Failed to trans var data on host, error code %u", ret); | |||||
GELOGE(ret, "[Call][TransVarOnHost] failed, session_id:%lu, ret:%u", session_id, ret); | |||||
return ret; | return ret; | ||||
} | } | ||||
@@ -319,14 +311,15 @@ Status TransVarData(const NodePtr &var, const VarTransRoad &trans_road, uint64_t | |||||
/// TensorDesc needs to be removed. This change is large and needs to be performed step by step. | /// TensorDesc needs to be removed. This change is large and needs to be performed step by step. | ||||
ret = ReAssignVarAddr(session_id, var->GetName(), trans_road.rbegin()->output, &var_device); | ret = ReAssignVarAddr(session_id, var->GetName(), trans_road.rbegin()->output, &var_device); | ||||
if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
GELOGE(ret, "Failed to re-assign memory on device, size %zu", trans_result.length); | |||||
GELOGE(ret, "[Call][ReAssignVarAddr] failed, session id:%lu, op:%s, ret:%u", | |||||
session_id, var->GetName().c_str(), ret); | |||||
return ret; | return ret; | ||||
} | } | ||||
// sync new data to device | // sync new data to device | ||||
ret = CopyVarToDevice(var, trans_result, var_device); | ret = CopyVarToDevice(var, trans_result, var_device); | ||||
if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
GELOGE(ret, "Failed to send var data to device"); | |||||
GELOGE(ret, "[Call][CopyVarToDevice] failed, var:%s, ret:%u", var->GetName().c_str(), ret); | |||||
return ret; | return ret; | ||||
} | } | ||||
@@ -350,7 +343,10 @@ Status TransTensor(uint8_t *var_data, const NodePtr &var_src, const NodePtr &var | |||||
TypeUtils::DataTypeToSerialString(src_data_datatype).c_str(), | TypeUtils::DataTypeToSerialString(src_data_datatype).c_str(), | ||||
TypeUtils::DataTypeToSerialString(dst_data_datatype).c_str(), | TypeUtils::DataTypeToSerialString(dst_data_datatype).c_str(), | ||||
src_data_shape_size, ret); | src_data_shape_size, ret); | ||||
GELOGE(INTERNAL_ERROR, "trans var data on host failed"); | |||||
GELOGE(INTERNAL_ERROR, "[Trans][DataType] from %s to %s failed, data size %ld, ret:%u", | |||||
TypeUtils::DataTypeToSerialString(src_data_datatype).c_str(), | |||||
TypeUtils::DataTypeToSerialString(dst_data_datatype).c_str(), | |||||
src_data_shape_size, ret); | |||||
return ret; | return ret; | ||||
}); | }); | ||||
return SUCCESS; | return SUCCESS; | ||||
@@ -366,9 +362,11 @@ Status CopyTensorFromSrcVarNode(const NodePtr &var_src, | |||||
/// need copy value from var_fp32 to var_fp16. | /// need copy value from var_fp32 to var_fp16. | ||||
/// [opdesc of var_src and var_dst are checked before passed in, no need to check if they are nullptr] | /// [opdesc of var_src and var_dst are checked before passed in, no need to check if they are nullptr] | ||||
GE_IF_BOOL_EXEC(var_src == nullptr || var_dst == nullptr, | GE_IF_BOOL_EXEC(var_src == nullptr || var_dst == nullptr, | ||||
REPORT_INNER_ERROR("E19999", "Param var_src or var_dst is empty, session_id:%lu, device_id:%u, " | |||||
REPORT_INNER_ERROR("E19999", "Param var_src or var_dst is nullptr, session_id:%lu, device_id:%u, " | |||||
"check invalid", session_id, device_id); | "check invalid", session_id, device_id); | ||||
GELOGE(FAILED, "node var is nullptr"); return FAILED); | |||||
GELOGE(FAILED, "[Check][Param] Param var_src or var_dst is nullptr, session_id:%lu, device_id:%u", | |||||
session_id, device_id); | |||||
return FAILED); | |||||
// src_node output_desc (fp32) | // src_node output_desc (fp32) | ||||
GeTensorDesc output_desc = var_src->GetOpDesc()->GetOutputDesc(0); | GeTensorDesc output_desc = var_src->GetOpDesc()->GetOutputDesc(0); | ||||
auto src_data_type = output_desc.GetDataType(); | auto src_data_type = output_desc.GetDataType(); | ||||
@@ -390,31 +388,45 @@ Status CopyTensorFromSrcVarNode(const NodePtr &var_src, | |||||
RtContextSwitchGuard switch_context(RT_CTX_NORMAL_MODE, device_id); | RtContextSwitchGuard switch_context(RT_CTX_NORMAL_MODE, device_id); | ||||
// copy from src_node | // copy from src_node | ||||
auto ret = CopyVarFromDevice(session_id, var_src, var_src_data, output_desc); | auto ret = CopyVarFromDevice(session_id, var_src, var_src_data, output_desc); | ||||
GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(FAILED, "Copy Var From Device failed"); return ret); | |||||
GE_IF_BOOL_EXEC(ret != SUCCESS, | |||||
GELOGE(FAILED, "[Call][CopyVarFromDevice] failed, session id:%lu, var_src:%s", | |||||
session_id, var_src->GetName().c_str()); | |||||
return ret); | |||||
// trans dtype | // trans dtype | ||||
formats::TransResult trans_result{}; | formats::TransResult trans_result{}; | ||||
ret = TransTensor(var_src_data.get(), var_src, var_dst, trans_result); | ret = TransTensor(var_src_data.get(), var_src, var_dst, trans_result); | ||||
GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(INTERNAL_ERROR, "trans var data on host failed"); return ret); | |||||
GE_IF_BOOL_EXEC(ret != SUCCESS, | |||||
GELOGE(INTERNAL_ERROR, "[Trans][Tensor] failed, var_src:%s, var_dst:%s", | |||||
var_src->GetName().c_str(), var_dst->GetName().c_str()); | |||||
return ret); | |||||
// reset src value. | // reset src value. | ||||
void *var_device = nullptr; | void *var_device = nullptr; | ||||
ret = ReAssignVarAddr(session_id, var_dst->GetName(), dst_tensor_desc, &var_device); | ret = ReAssignVarAddr(session_id, var_dst->GetName(), dst_tensor_desc, &var_device); | ||||
GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(INTERNAL_ERROR, "assign mem failed"); return ret); | |||||
GE_IF_BOOL_EXEC(ret != SUCCESS, | |||||
GELOGE(INTERNAL_ERROR, "[Call][ReAssignVarAddr] failed, session id:%lu, var_dst:%s", | |||||
session_id, var_dst->GetName().c_str()); | |||||
return ret); | |||||
// copy to device | // copy to device | ||||
ret = CopyVarToDevice(var_dst, trans_result, var_device); | ret = CopyVarToDevice(var_dst, trans_result, var_device); | ||||
GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(ret, "Failed to send var data to device"); return ret); | |||||
GE_IF_BOOL_EXEC(ret != SUCCESS, | |||||
GELOGE(ret, "[Call][CopyVarToDevice] failed, var_dst:%s, ret:%u", | |||||
var_dst->GetName().c_str(), ret); | |||||
return ret); | |||||
return SUCCESS; | return SUCCESS; | ||||
} | } | ||||
} // namespace | } // namespace | ||||
Status TransVarDataUtils::SyncVarData2BroadCast(const string &var_name, const ge::GeTensorDesc &src_tensor_desc, | Status TransVarDataUtils::SyncVarData2BroadCast(const string &var_name, const ge::GeTensorDesc &src_tensor_desc, | ||||
uint8_t *dst_addr, int64_t dst_addr_size, uint64_t session_id) { | uint8_t *dst_addr, int64_t dst_addr_size, uint64_t session_id) { | ||||
GE_CHK_BOOL_RET_STATUS(dst_addr != nullptr, FAILED, "dst addr is null. "); | |||||
GE_CHK_BOOL_RET_STATUS(dst_addr != nullptr, FAILED, "[Check][Param] dst addr is nullptr."); | |||||
uint8_t *src_host_addr = nullptr; | uint8_t *src_host_addr = nullptr; | ||||
int64_t src_addr_size = 0; | int64_t src_addr_size = 0; | ||||
GE_MAKE_GUARD_RTMEM(src_host_addr); | GE_MAKE_GUARD_RTMEM(src_host_addr); | ||||
GE_CHK_STATUS_RET(SyncTensorToHost(var_name, src_tensor_desc, &src_host_addr, src_addr_size, session_id)); | GE_CHK_STATUS_RET(SyncTensorToHost(var_name, src_tensor_desc, &src_host_addr, src_addr_size, session_id)); | ||||
GELOGI("src_addr_size: %ld, dst_addr_size: %ld", src_addr_size, dst_addr_size); | GELOGI("src_addr_size: %ld, dst_addr_size: %ld", src_addr_size, dst_addr_size); | ||||
GE_CHK_BOOL_RET_STATUS(src_addr_size == dst_addr_size, FAILED, "var data size is not equal broadcast "); | |||||
GE_CHK_BOOL_RET_STATUS(src_addr_size == dst_addr_size, FAILED, | |||||
"[Check][Param] src_addr_size:%ld not equal to dst_addr_size:%ld", | |||||
src_addr_size, dst_addr_size); | |||||
GE_CHK_RT_RET(rtMemcpy(dst_addr, dst_addr_size, src_host_addr, src_addr_size, RT_MEMCPY_HOST_TO_DEVICE)); | GE_CHK_RT_RET(rtMemcpy(dst_addr, dst_addr_size, src_host_addr, src_addr_size, RT_MEMCPY_HOST_TO_DEVICE)); | ||||
return SUCCESS; | return SUCCESS; | ||||
@@ -422,7 +434,7 @@ Status TransVarDataUtils::SyncVarData2BroadCast(const string &var_name, const ge | |||||
Status TransVarDataUtils::SyncBroadCastData2Var(uint8_t *src_addr, int64_t src_addr_size, const string &var_name, | Status TransVarDataUtils::SyncBroadCastData2Var(uint8_t *src_addr, int64_t src_addr_size, const string &var_name, | ||||
const ge::GeTensorDesc &dst_tensor_desc, uint64_t session_id) { | const ge::GeTensorDesc &dst_tensor_desc, uint64_t session_id) { | ||||
GE_CHK_BOOL_RET_STATUS(src_addr != nullptr, FAILED, "src addr is null. "); | |||||
GE_CHK_BOOL_RET_STATUS(src_addr != nullptr, FAILED, "[Check][Param] src addr is nullptr. "); | |||||
uint8_t *host_addr = nullptr; | uint8_t *host_addr = nullptr; | ||||
GE_MAKE_GUARD_RTMEM(host_addr); | GE_MAKE_GUARD_RTMEM(host_addr); | ||||
GE_CHK_RT_RET(rtMallocHost(reinterpret_cast<void **>(&host_addr), src_addr_size)); | GE_CHK_RT_RET(rtMallocHost(reinterpret_cast<void **>(&host_addr), src_addr_size)); | ||||
@@ -436,7 +448,7 @@ Status TransVarDataUtils::SyncBroadCastData2Var(uint8_t *src_addr, int64_t src_a | |||||
Status TransVarDataUtils::SyncTensorToHost(const string &var_name, const ge::GeTensorDesc &src_tensor_desc, | Status TransVarDataUtils::SyncTensorToHost(const string &var_name, const ge::GeTensorDesc &src_tensor_desc, | ||||
uint8_t **host_addr, int64_t &src_tensor_size, uint64_t session_id) { | uint8_t **host_addr, int64_t &src_tensor_size, uint64_t session_id) { | ||||
GE_CHK_STATUS_RET(ge::TensorUtils::GetSize(src_tensor_desc, src_tensor_size), "get size from TensorDesc failed"); | |||||
GE_CHK_STATUS_RET(ge::TensorUtils::GetSize(src_tensor_desc, src_tensor_size), "[Get][Size] from TensorDesc failed"); | |||||
uint8_t *src_addr = nullptr; | uint8_t *src_addr = nullptr; | ||||
GE_CHK_STATUS_RET(VarManager::Instance(session_id)->GetVarAddr(var_name, src_tensor_desc, &src_addr)); | GE_CHK_STATUS_RET(VarManager::Instance(session_id)->GetVarAddr(var_name, src_tensor_desc, &src_addr)); | ||||
@@ -493,7 +505,8 @@ Status TransVarDataUtils::TransAllVarData(const vector<NodePtr> &variable_nodes, | |||||
if (rt_ret != RT_ERROR_NONE) { | if (rt_ret != RT_ERROR_NONE) { | ||||
REPORT_CALL_ERROR("E19999", "Call rtCtxSetCurrent failed, session_id:%lu, graph_id:%u, ret:0x%X,", | REPORT_CALL_ERROR("E19999", "Call rtCtxSetCurrent failed, session_id:%lu, graph_id:%u, ret:0x%X,", | ||||
session_id, graph_id, rt_ret); | session_id, graph_id, rt_ret); | ||||
GELOGE(RT_FAILED, "Failed to set context, error_code is: 0x%X.", rt_ret); | |||||
GELOGE(RT_FAILED, "[Call][RtCtxSetCurrent] failed, session_id:%lu, graph_id:%u, ret:0x%X,", | |||||
session_id, graph_id, rt_ret); | |||||
return RT_ERROR_TO_GE_STATUS(rt_ret); | return RT_ERROR_TO_GE_STATUS(rt_ret); | ||||
} | } | ||||
uint32_t allocated_graph_id = 0; | uint32_t allocated_graph_id = 0; | ||||
@@ -501,8 +514,8 @@ Status TransVarDataUtils::TransAllVarData(const vector<NodePtr> &variable_nodes, | |||||
if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
REPORT_CALL_ERROR("E19999", "Get allocated GraphId failed, session_id:%lu, graph_id:%u, ret:0x%X,", | REPORT_CALL_ERROR("E19999", "Get allocated GraphId failed, session_id:%lu, graph_id:%u, ret:0x%X,", | ||||
session_id, graph_id, ret); | session_id, graph_id, ret); | ||||
GELOGE(INTERNAL_ERROR, "var has not been allocated, node:%s, graph_id:%u.", node->GetName().c_str(), | |||||
graph_id); | |||||
GELOGE(INTERNAL_ERROR, "[Get][AllocatedGraphId] failed, node:%s, graph_id:%u.", | |||||
node->GetName().c_str(), graph_id); | |||||
return INTERNAL_ERROR; | return INTERNAL_ERROR; | ||||
} | } | ||||
uint32_t changed_graph_id = 0; | uint32_t changed_graph_id = 0; | ||||
@@ -518,7 +531,8 @@ Status TransVarDataUtils::TransAllVarData(const vector<NodePtr> &variable_nodes, | |||||
} | } | ||||
ret = TransVarData(node, *trans_road, session_id); | ret = TransVarData(node, *trans_road, session_id); | ||||
if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
GELOGE(INTERNAL_ERROR, "TransVarData failed, node:%s, graph_id:%u.", node->GetName().c_str(), graph_id); | |||||
GELOGE(INTERNAL_ERROR, "[Trans][VarData] failed, node:%s, graph_id:%u, session_id:%lu.", | |||||
node->GetName().c_str(), graph_id, session_id); | |||||
return INTERNAL_ERROR; | return INTERNAL_ERROR; | ||||
} | } | ||||
VarManager::Instance(session_id)->RemoveChangedGraphId(node->GetName()); | VarManager::Instance(session_id)->RemoveChangedGraphId(node->GetName()); | ||||
@@ -527,7 +541,7 @@ Status TransVarDataUtils::TransAllVarData(const vector<NodePtr> &variable_nodes, | |||||
}, | }, | ||||
node, session_id, context, graph_id, ErrorManager::GetInstance().GetErrorManagerContext()); | node, session_id, context, graph_id, ErrorManager::GetInstance().GetErrorManagerContext()); | ||||
if (!f.valid()) { | if (!f.valid()) { | ||||
GELOGE(FAILED, "Future is invalid"); | |||||
GELOGE(FAILED, "[Check][Param] Future is invalid, session id:%lu, graph id:%u", session_id, graph_id); | |||||
return FAILED; | return FAILED; | ||||
} | } | ||||
vector_future.push_back(std::move(f)); | vector_future.push_back(std::move(f)); | ||||
@@ -537,7 +551,7 @@ Status TransVarDataUtils::TransAllVarData(const vector<NodePtr> &variable_nodes, | |||||
for (size_t i = 0; i < vector_future.size(); ++i) { | for (size_t i = 0; i < vector_future.size(); ++i) { | ||||
ret_status = vector_future[i].get(); | ret_status = vector_future[i].get(); | ||||
if (ret_status != SUCCESS) { | if (ret_status != SUCCESS) { | ||||
GELOGE(ret_status, "TransAllVarData:: trans %zu vardata failed", i); | |||||
GELOGE(ret_status, "[Check][Param] trans %zu vardata failed", i); | |||||
return ret_status; | return ret_status; | ||||
} | } | ||||
} | } | ||||
@@ -550,7 +564,8 @@ Status TransVarDataUtils::CopyVarData(const ComputeGraphPtr &compute_graph, uint | |||||
if (compute_graph == nullptr) { | if (compute_graph == nullptr) { | ||||
REPORT_INNER_ERROR("E19999", "Param compute_graph is nullptr, session_id:%lu, device_id:%u, check invalid", | REPORT_INNER_ERROR("E19999", "Param compute_graph is nullptr, session_id:%lu, device_id:%u, check invalid", | ||||
session_id, device_id); | session_id, device_id); | ||||
GELOGE(FAILED, "compute_graph is nullptr"); | |||||
GELOGE(FAILED, "[Check][Param] compute_graph is nullptr, session_id:%lu, device_id:%u", | |||||
session_id, device_id); | |||||
return FAILED; | return FAILED; | ||||
} | } | ||||
@@ -568,7 +583,10 @@ Status TransVarDataUtils::CopyVarData(const ComputeGraphPtr &compute_graph, uint | |||||
GELOGI("current_var_node__: [%s] copy_from_var_node__: [%s].", node->GetName().c_str(), | GELOGI("current_var_node__: [%s] copy_from_var_node__: [%s].", node->GetName().c_str(), | ||||
src_node->GetName().c_str()); | src_node->GetName().c_str()); | ||||
auto ret = CopyTensorFromSrcVarNode(src_node, node, session_id, device_id); | auto ret = CopyTensorFromSrcVarNode(src_node, node, session_id, device_id); | ||||
GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(FAILED, "copy tensor failed!"); return FAILED); | |||||
GE_IF_BOOL_EXEC(ret != SUCCESS, | |||||
GELOGE(FAILED, "[Copy][Tensor] failed, src_node:%s, node:%s, session_id:%lu, device_id:%u", | |||||
src_node->GetName().c_str(), node->GetName().c_str(), session_id, device_id); | |||||
return FAILED); | |||||
// only copy once | // only copy once | ||||
(void) ge::AttrUtils::SetBool(node->GetOpDesc(), "_copy_value", true); // no need to check value | (void) ge::AttrUtils::SetBool(node->GetOpDesc(), "_copy_value", true); // no need to check value | ||||
} | } | ||||
@@ -63,17 +63,15 @@ Status Debug::DumpDevMem(const char *file, const void *addr, int64_t size) { | |||||
uint8_t *host_addr = nullptr; | uint8_t *host_addr = nullptr; | ||||
rtError_t ret = rtMallocHost(reinterpret_cast<void **>(&host_addr), size); | rtError_t ret = rtMallocHost(reinterpret_cast<void **>(&host_addr), size); | ||||
if (ret != RT_ERROR_NONE) { | if (ret != RT_ERROR_NONE) { | ||||
REPORT_CALL_ERROR("E19999", "Call rtMallocHost failed, size:%zu, ret: 0x%X", | |||||
size, ret); | |||||
GELOGE(FAILED, "Call rt api rtMallocHost failed, ret: 0x%X", ret); | |||||
REPORT_CALL_ERROR("E19999", "Call rtMallocHost failed, size:%zu, ret:0x%X", size, ret); | |||||
GELOGE(FAILED, "[Call][RtMallocHost] failed, size:%zu, ret:0x%X", size, ret); | |||||
return FAILED; | return FAILED; | ||||
} | } | ||||
GE_MAKE_GUARD_RTMEM(host_addr); | GE_MAKE_GUARD_RTMEM(host_addr); | ||||
ret = rtMemcpy(host_addr, size, addr, size, RT_MEMCPY_DEVICE_TO_HOST); | ret = rtMemcpy(host_addr, size, addr, size, RT_MEMCPY_DEVICE_TO_HOST); | ||||
if (ret != RT_ERROR_NONE) { | if (ret != RT_ERROR_NONE) { | ||||
REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%zu, ret: 0x%X", | |||||
size, ret); | |||||
GELOGE(FAILED, "Call rt api rtMemcpy failed, ret: 0x%X", ret); | |||||
REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%zu, ret:0x%X", size, ret); | |||||
GELOGE(FAILED, "[Call][RtMemcpy] failed, size:%zu, ret:0x%X", size, ret); | |||||
return FAILED; | return FAILED; | ||||
} | } | ||||
@@ -28,7 +28,8 @@ Status HcomOmeUtil::GetHcclDataType(const ge::ConstOpDescPtr &op_desc, | |||||
std::vector<GETaskKernelHcclInfo> &kernel_hccl_infos) { | std::vector<GETaskKernelHcclInfo> &kernel_hccl_infos) { | ||||
GE_CHECK_NOTNULL(op_desc); | GE_CHECK_NOTNULL(op_desc); | ||||
if (CheckKernelHcclInfo(op_desc, kernel_hccl_infos) != SUCCESS) { | if (CheckKernelHcclInfo(op_desc, kernel_hccl_infos) != SUCCESS) { | ||||
GELOGE(PARAM_INVALID, "HcomOmeUtil:: the number of GETaskKernelHcclInfo is invalid."); | |||||
GELOGE(PARAM_INVALID, "[Check][KernelHcclInfo] failed, op:%s(%s).", | |||||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||||
return PARAM_INVALID; | return PARAM_INVALID; | ||||
} | } | ||||
GELOGI("GetHcclDataType start, node[%s], opType[%s].", op_desc->GetName().c_str(), op_desc->GetType().c_str()); | GELOGI("GetHcclDataType start, node[%s], opType[%s].", op_desc->GetName().c_str(), op_desc->GetType().c_str()); | ||||
@@ -40,10 +41,10 @@ Status HcomOmeUtil::GetHcclDataType(const ge::ConstOpDescPtr &op_desc, | |||||
if (op_desc->GetType() == HCOMRECEIVE) { | if (op_desc->GetType() == HCOMRECEIVE) { | ||||
bool ret = ge::AttrUtils::GetDataType(op_desc, HCOM_ATTR_DATA_TYPE, src_data_type); | bool ret = ge::AttrUtils::GetDataType(op_desc, HCOM_ATTR_DATA_TYPE, src_data_type); | ||||
if (ret == false) { | if (ret == false) { | ||||
REPORT_INNER_ERROR("E19999", "Get Attr:%s in op:%s(%s) fail", | |||||
HCOM_ATTR_DATA_TYPE.c_str(), | |||||
REPORT_INNER_ERROR("E19999", "Get Attr:%s in op:%s(%s) fail", HCOM_ATTR_DATA_TYPE.c_str(), | |||||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | op_desc->GetName().c_str(), op_desc->GetType().c_str()); | ||||
GELOGE(PARAM_INVALID, "op:HcomReceive, op desc no attr: dtype."); | |||||
GELOGE(PARAM_INVALID, "[Get][Attr] %s in op:%s(%s) fail", HCOM_ATTR_DATA_TYPE.c_str(), | |||||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||||
return PARAM_INVALID; | return PARAM_INVALID; | ||||
} | } | ||||
} else { | } else { | ||||
@@ -55,13 +56,11 @@ Status HcomOmeUtil::GetHcclDataType(const ge::ConstOpDescPtr &op_desc, | |||||
auto iter = kConstOpHcclDataType.find(static_cast<int64_t>(src_data_type)); | auto iter = kConstOpHcclDataType.find(static_cast<int64_t>(src_data_type)); | ||||
if (iter == kConstOpHcclDataType.end()) { | if (iter == kConstOpHcclDataType.end()) { | ||||
REPORT_INNER_ERROR("E19999", "Attr:%s in op:%s(%s), value data_type:%s, not support in kConstOpHcclDataType now, " | REPORT_INNER_ERROR("E19999", "Attr:%s in op:%s(%s), value data_type:%s, not support in kConstOpHcclDataType now, " | ||||
"check invalid", HCOM_ATTR_DATA_TYPE.c_str(), | |||||
op_desc->GetName().c_str(), op_desc->GetType().c_str(), | |||||
ge::TypeUtils::DataTypeToSerialString(src_data_type).c_str()); | |||||
GELOGE(PARAM_INVALID, | |||||
"HcomOmeUtil:: Node: %s Optype: %s HcomDataType cann't support! Current Davinci Data Type : %s", | |||||
op_desc->GetName().c_str(), op_desc->GetType().c_str(), | |||||
ge::TypeUtils::DataTypeToSerialString(src_data_type).c_str()); | |||||
"check invalid", HCOM_ATTR_DATA_TYPE.c_str(), op_desc->GetName().c_str(), | |||||
op_desc->GetType().c_str(), ge::TypeUtils::DataTypeToSerialString(src_data_type).c_str()); | |||||
GELOGE(PARAM_INVALID, "[Check][Param] Attr:%s in op:%s(%s), value data_type:%s, " | |||||
"not support in kConstOpHcclDataType now", HCOM_ATTR_DATA_TYPE.c_str(), op_desc->GetName().c_str(), | |||||
op_desc->GetType().c_str(), ge::TypeUtils::DataTypeToSerialString(src_data_type).c_str()); | |||||
return PARAM_INVALID; | return PARAM_INVALID; | ||||
} | } | ||||
@@ -73,7 +72,7 @@ Status HcomOmeUtil::GetHcclDataType(const ge::ConstOpDescPtr &op_desc, | |||||
Status HcomOmeUtil::GetHcclTypeSize(HcclDataType data_type, int32_t &size) { | Status HcomOmeUtil::GetHcclTypeSize(HcclDataType data_type, int32_t &size) { | ||||
auto iter = kConstOpHcclDataTypeSize.find(data_type); | auto iter = kConstOpHcclDataTypeSize.find(data_type); | ||||
GE_CHK_BOOL_EXEC(iter != kConstOpHcclDataTypeSize.end(), return PARAM_INVALID, | GE_CHK_BOOL_EXEC(iter != kConstOpHcclDataTypeSize.end(), return PARAM_INVALID, | ||||
"HcomOmeUtil::HcomDataTypeSize , No DataTypeSize!"); | |||||
"[Check][Param] param data_type:%d not find", data_type); | |||||
size = iter->second; | size = iter->second; | ||||
return SUCCESS; | return SUCCESS; | ||||
@@ -83,21 +82,22 @@ Status HcomOmeUtil::GetHcomCount(const ge::ConstOpDescPtr &op_desc, HcclDataType | |||||
int &count) { | int &count) { | ||||
GE_CHECK_NOTNULL(op_desc); | GE_CHECK_NOTNULL(op_desc); | ||||
if (!IsHCOMOp(op_desc->GetType())) { | if (!IsHCOMOp(op_desc->GetType())) { | ||||
REPORT_INNER_ERROR("E19999", "Op:%s(%s) is not hcom op, check invalid", | |||||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||||
GELOGE(PARAM_INVALID, "HcomOmeUtil:: operator is not Hcom operator."); | |||||
REPORT_INNER_ERROR("E19999", "Op:%s(%s) is not hcom op, check invalid", op_desc->GetName().c_str(), | |||||
op_desc->GetType().c_str()); | |||||
GELOGE(PARAM_INVALID, "[Check][Param] Op:%s(%s) is not hcom op", op_desc->GetName().c_str(), | |||||
op_desc->GetType().c_str()); | |||||
return PARAM_INVALID; | return PARAM_INVALID; | ||||
} | } | ||||
int64_t total_size = 0; | int64_t total_size = 0; | ||||
int64_t align_size = 512; | int64_t align_size = 512; | ||||
int32_t size = 0; | int32_t size = 0; | ||||
GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclTypeSize(data_type, size), "GetHcomCount: GetHcclTypeSize fail!"); | |||||
GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclTypeSize(data_type, size), "[Get][HcclTypeSize] fail, datatype:%d", data_type); | |||||
if (op_desc->GetType() == HCOMRECEIVE) { | if (op_desc->GetType() == HCOMRECEIVE) { | ||||
for (size_t i = 0; i < op_desc->GetOutputsSize(); ++i) { | for (size_t i = 0; i < op_desc->GetOutputsSize(); ++i) { | ||||
int64_t output_size = 0; | int64_t output_size = 0; | ||||
GE_CHECK_NOTNULL(op_desc->GetOutputDescPtr(i)); | GE_CHECK_NOTNULL(op_desc->GetOutputDescPtr(i)); | ||||
GE_CHK_STATUS_RET(ge::TensorUtils::GetSize(*op_desc->GetOutputDescPtr(i), output_size), | GE_CHK_STATUS_RET(ge::TensorUtils::GetSize(*op_desc->GetOutputDescPtr(i), output_size), | ||||
"Get size from TensorDesc failed, op: %s, output index: %zu.", op_desc->GetName().c_str(), i); | |||||
"[Get][Size] from TensorDesc failed, op:%s, output index:%zu.", op_desc->GetName().c_str(), i); | |||||
output_size = (output_size + align_size - 1) / align_size * align_size; | output_size = (output_size + align_size - 1) / align_size * align_size; | ||||
total_size += output_size; | total_size += output_size; | ||||
} | } | ||||
@@ -107,42 +107,48 @@ Status HcomOmeUtil::GetHcomCount(const ge::ConstOpDescPtr &op_desc, HcclDataType | |||||
int64_t block_size = 0; | int64_t block_size = 0; | ||||
GE_CHECK_NOTNULL(op_desc->GetInputDescPtr(i)); | GE_CHECK_NOTNULL(op_desc->GetInputDescPtr(i)); | ||||
GE_CHK_STATUS_RET(ge::TensorUtils::GetSize(*op_desc->GetInputDescPtr(i), input_size), | GE_CHK_STATUS_RET(ge::TensorUtils::GetSize(*op_desc->GetInputDescPtr(i), input_size), | ||||
"get size from TensorDesc failed, op : %s, input index : %zu", op_desc->GetName().c_str(), i); | |||||
"[Get][Size] from TensorDesc failed, op:%s, input index:%zu", op_desc->GetName().c_str(), i); | |||||
// dynamic shape hccl op get size from output tensor desc | // dynamic shape hccl op get size from output tensor desc | ||||
if (op_desc->HasAttr(ATTR_NAME_IS_UNKNOWN_SHAPE)) { | if (op_desc->HasAttr(ATTR_NAME_IS_UNKNOWN_SHAPE)) { | ||||
GE_CHECK_NOTNULL(op_desc->GetOutputDescPtr(i)); | GE_CHECK_NOTNULL(op_desc->GetOutputDescPtr(i)); | ||||
GE_CHK_STATUS_RET(ge::TensorUtils::GetSize(*op_desc->GetOutputDescPtr(i), input_size), | GE_CHK_STATUS_RET(ge::TensorUtils::GetSize(*op_desc->GetOutputDescPtr(i), input_size), | ||||
"get size from TensorDesc failed, op : %s, input index : %zu", op_desc->GetName().c_str(), i); | |||||
"[Get][Size] from TensorDesc failed, op:%s, input index:%zu", op_desc->GetName().c_str(), i); | |||||
} | } | ||||
GE_IF_BOOL_EXEC( | GE_IF_BOOL_EXEC( | ||||
op_desc->GetType() == HCOMREDUCESCATTER, int32_t rank_size = 0; | op_desc->GetType() == HCOMREDUCESCATTER, int32_t rank_size = 0; | ||||
GE_CHK_BOOL_RET_STATUS(ge::AttrUtils::GetInt(op_desc, HCOM_ATTR_RANK_SIZE, rank_size), PARAM_INVALID, | GE_CHK_BOOL_RET_STATUS(ge::AttrUtils::GetInt(op_desc, HCOM_ATTR_RANK_SIZE, rank_size), PARAM_INVALID, | ||||
"get HCOM_ATTR_RANK_SIZE failed"); | |||||
GE_CHK_BOOL_RET_STATUS(rank_size != 0, PARAM_INVALID, "rank size is zero"); | |||||
int64_t shape_size = op_desc->GetInputDescPtr(i)->GetShape().GetShapeSize(); GE_CHK_STATUS_RET( | |||||
ge::CheckInt64Uint32MulOverflow(shape_size, size), "Product of shape size and size beyond INT64_MAX"); | |||||
"[Get][Attr] %s in op:%s(%s) failed", HCOM_ATTR_RANK_SIZE.c_str(), | |||||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||||
GE_CHK_BOOL_RET_STATUS(rank_size != 0, PARAM_INVALID, "[Check][Param] rank size is zero"); | |||||
int64_t shape_size = op_desc->GetInputDescPtr(i)->GetShape().GetShapeSize(); | |||||
GE_CHK_STATUS_RET(ge::CheckInt64Uint32MulOverflow(shape_size, size), | |||||
"[Check][Param] Product of shape size:%ld and size:%d beyond INT64_MAX, op:%s(%s)", | |||||
shape_size, size, op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||||
block_size = (shape_size * size) / rank_size; | block_size = (shape_size * size) / rank_size; | ||||
GE_CHK_STATUS_RET(ge::CheckInt64AddOverflow(total_size, block_size), "Total size is beyond the INT64_MAX"); | |||||
GE_CHK_STATUS_RET(ge::CheckInt64AddOverflow(total_size, block_size), | |||||
"[Check][Param] Total size:%ld is beyond the INT64_MAX, op:%s(%s)", | |||||
total_size, op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||||
total_size = total_size + block_size; continue;); | total_size = total_size + block_size; continue;); | ||||
int64_t shape_size = op_desc->GetInputDescPtr(i)->GetShape().GetShapeSize(); | int64_t shape_size = op_desc->GetInputDescPtr(i)->GetShape().GetShapeSize(); | ||||
GELOGD("hcom util node %s inputsize %ld, shapesize %ld, datasize %d.", | GELOGD("hcom util node %s inputsize %ld, shapesize %ld, datasize %d.", | ||||
op_desc->GetName().c_str(), input_size, shape_size, size); | op_desc->GetName().c_str(), input_size, shape_size, size); | ||||
GE_CHK_STATUS_RET(ge::CheckInt64Int32MulOverflow(shape_size, size), | GE_CHK_STATUS_RET(ge::CheckInt64Int32MulOverflow(shape_size, size), | ||||
"Product of shape size and size beyond INT64_MAX"); | |||||
"[Check][Param] Product of shape size:%ld and size:%d beyond INT64_MAX", shape_size, size); | |||||
GE_IF_BOOL_EXEC(is_allgather, block_size = shape_size * size;); | GE_IF_BOOL_EXEC(is_allgather, block_size = shape_size * size;); | ||||
GE_IF_BOOL_EXEC(!is_allgather, block_size = (input_size + align_size - 1) / align_size * align_size;); | GE_IF_BOOL_EXEC(!is_allgather, block_size = (input_size + align_size - 1) / align_size * align_size;); | ||||
GE_CHK_STATUS_RET(ge::CheckInt64AddOverflow(total_size, block_size), "Total size is beyond the INT64_MAX"); | |||||
GE_CHK_STATUS_RET(ge::CheckInt64AddOverflow(total_size, block_size), | |||||
"[Check][Param] Total size:%ld is beyond the INT64_MAX", total_size); | |||||
total_size = total_size + block_size; | total_size = total_size + block_size; | ||||
} | } | ||||
} | } | ||||
GE_CHK_BOOL_RET_STATUS(size != 0, PARAM_INVALID, "Size is zero"); | |||||
GE_CHK_BOOL_RET_STATUS(size != 0, PARAM_INVALID, "[Check][Param] Size is zero"); | |||||
count = static_cast<int>(total_size / size); | count = static_cast<int>(total_size / size); | ||||
GE_CHK_BOOL_EXEC(total_size % size == 0, return PARAM_INVALID, "total_size:%ld is not divisiable by size:%d.", | |||||
total_size, size); | |||||
GE_CHK_BOOL_EXEC(total_size % size == 0, return PARAM_INVALID, | |||||
"[Check][Param] total_size:%ld is not divisiable by size:%d.", total_size, size); | |||||
return SUCCESS; | return SUCCESS; | ||||
} | } | ||||
@@ -153,32 +159,34 @@ Status HcomOmeUtil::GetHorovodCount(const ge::ConstOpDescPtr &op_desc, | |||||
if (!IsHorovodOp(op_desc->GetType())) { | if (!IsHorovodOp(op_desc->GetType())) { | ||||
REPORT_INNER_ERROR("E19999", "Op:%s(%s) is not horovod op, check invalid", | REPORT_INNER_ERROR("E19999", "Op:%s(%s) is not horovod op, check invalid", | ||||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | op_desc->GetName().c_str(), op_desc->GetType().c_str()); | ||||
GELOGE(PARAM_INVALID, "HcomOmeUtil:: operator is not Horovod operator."); | |||||
GELOGE(PARAM_INVALID, "[Call][IsHorovodOp] failed, Op:%s(%s) is not horovod op", | |||||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||||
return PARAM_INVALID; | return PARAM_INVALID; | ||||
} | } | ||||
int64_t align_size = 512; | int64_t align_size = 512; | ||||
int32_t size = 0; | int32_t size = 0; | ||||
for (size_t i = 0; i < op_desc->GetInputsSize(); i++) { | for (size_t i = 0; i < op_desc->GetInputsSize(); i++) { | ||||
GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclTypeSize(static_cast<HcclDataType>(kernel_hccl_infos[i].dataType), size), | GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclTypeSize(static_cast<HcclDataType>(kernel_hccl_infos[i].dataType), size), | ||||
"GetHorovodCount: GetHcclTypeSize fail!"); | |||||
"[Call][GetHcclTypeSize] fail, op:%s(%s)", | |||||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||||
int64_t input_size = 0; | int64_t input_size = 0; | ||||
int64_t block_size = 0; | int64_t block_size = 0; | ||||
GE_CHECK_NOTNULL(op_desc->GetInputDescPtr(i)); | GE_CHECK_NOTNULL(op_desc->GetInputDescPtr(i)); | ||||
GE_CHK_STATUS_RET(ge::TensorUtils::GetSize(*op_desc->GetInputDescPtr(i), input_size), | GE_CHK_STATUS_RET(ge::TensorUtils::GetSize(*op_desc->GetInputDescPtr(i), input_size), | ||||
"get size from TensorDesc failed, op : %s, input index : %zu", op_desc->GetName().c_str(), i); | |||||
"[Get][Size] from TensorDesc failed, op:%s, input index:%zu", op_desc->GetName().c_str(), i); | |||||
int64_t shape_size = op_desc->GetInputDescPtr(i)->GetShape().GetShapeSize(); | int64_t shape_size = op_desc->GetInputDescPtr(i)->GetShape().GetShapeSize(); | ||||
GE_CHK_STATUS_RET(ge::CheckInt64Int32MulOverflow(shape_size, size), | GE_CHK_STATUS_RET(ge::CheckInt64Int32MulOverflow(shape_size, size), | ||||
"Product of shape size and size beyond INT64_MAX"); | |||||
"[Check][Param] Product of shape size:%ld and size:%d beyond INT64_MAX", shape_size, size); | |||||
if (kernel_hccl_infos[0].hccl_type == HVDCALLBACKALLGATHER) { | if (kernel_hccl_infos[0].hccl_type == HVDCALLBACKALLGATHER) { | ||||
block_size = shape_size * size; | block_size = shape_size * size; | ||||
} else { | } else { | ||||
block_size = (input_size + align_size - 1) / align_size * align_size; | block_size = (input_size + align_size - 1) / align_size * align_size; | ||||
} | } | ||||
GE_CHK_BOOL_RET_STATUS(size != 0, PARAM_INVALID, "Size is zero"); | |||||
GE_CHK_BOOL_EXEC(block_size % size == 0, return PARAM_INVALID, "block_size:%ld is not divisiable by size:%d.", | |||||
block_size, size); | |||||
GE_CHK_BOOL_RET_STATUS(size != 0, PARAM_INVALID, "[Check][Param] Size is zero"); | |||||
GE_CHK_BOOL_EXEC(block_size % size == 0, return PARAM_INVALID, | |||||
"[Check][Param] block_size:%ld is not divisiable by size:%d.", block_size, size); | |||||
kernel_hccl_infos[i].count = static_cast<int>(block_size / size); | kernel_hccl_infos[i].count = static_cast<int>(block_size / size); | ||||
} | } | ||||
@@ -191,7 +199,8 @@ Status HcomOmeUtil::GetHcclCount(const ge::ConstOpDescPtr &op_desc, | |||||
Status ret; | Status ret; | ||||
ret = CheckKernelHcclInfo(op_desc, kernel_hccl_infos); | ret = CheckKernelHcclInfo(op_desc, kernel_hccl_infos); | ||||
if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
GELOGE(PARAM_INVALID, "HcomOmeUtil:: the number of GETaskKernelHcclInfo is invalid."); | |||||
GELOGE(PARAM_INVALID, "[Check][KernelHcclInfo] failed, the number of GETaskKernelHcclInfo is invalid, op:%s(%s).", | |||||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||||
return PARAM_INVALID; | return PARAM_INVALID; | ||||
} | } | ||||
GELOGI("GetHcclCount start, node[%s], opType[%s].", op_desc->GetName().c_str(), op_desc->GetType().c_str()); | GELOGI("GetHcclCount start, node[%s], opType[%s].", op_desc->GetName().c_str(), op_desc->GetType().c_str()); | ||||
@@ -200,7 +209,7 @@ Status HcomOmeUtil::GetHcclCount(const ge::ConstOpDescPtr &op_desc, | |||||
ret = GetHcomCount(op_desc, static_cast<HcclDataType>(kernel_hccl_infos[0].dataType), | ret = GetHcomCount(op_desc, static_cast<HcclDataType>(kernel_hccl_infos[0].dataType), | ||||
kernel_hccl_infos[0].hccl_type == HCOMALLGATHER, count); | kernel_hccl_infos[0].hccl_type == HCOMALLGATHER, count); | ||||
if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
GELOGE(ret, "HcomOmeUtil:: Node: %s Optype: %s get the Hcom operator hccl count fail.", | |||||
GELOGE(ret, "[Call][GetHcomCount] Node:%s Optype:%s get the Hcom operator hccl count fail.", | |||||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | op_desc->GetName().c_str(), op_desc->GetType().c_str()); | ||||
return PARAM_INVALID; | return PARAM_INVALID; | ||||
} | } | ||||
@@ -210,7 +219,7 @@ Status HcomOmeUtil::GetHcclCount(const ge::ConstOpDescPtr &op_desc, | |||||
if (IsHorovodOp(op_desc->GetType())) { | if (IsHorovodOp(op_desc->GetType())) { | ||||
ret = GetHorovodCount(op_desc, kernel_hccl_infos); | ret = GetHorovodCount(op_desc, kernel_hccl_infos); | ||||
if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
GELOGE(PARAM_INVALID, "HcomOmeUtil:: Node: %s Optype: %s get the Horovod hccl operator count fail.", | |||||
GELOGE(PARAM_INVALID, "[Call][GetHorovodCount] Node:%s Optype:%s get the Horovod hccl operator count fail.", | |||||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | op_desc->GetName().c_str(), op_desc->GetType().c_str()); | ||||
return PARAM_INVALID; | return PARAM_INVALID; | ||||
} | } | ||||
@@ -225,11 +234,10 @@ Status HcomOmeUtil::GetHcclOperationType(const ge::ConstOpDescPtr &op_desc, Hccl | |||||
if (IsHCOMOp(op_desc->GetType())) { | if (IsHCOMOp(op_desc->GetType())) { | ||||
std::string hcom_op_type; | std::string hcom_op_type; | ||||
GE_CHK_BOOL_EXEC(ge::AttrUtils::GetStr(op_desc, HCOM_ATTR_REDUCE_TYPE, hcom_op_type), | GE_CHK_BOOL_EXEC(ge::AttrUtils::GetStr(op_desc, HCOM_ATTR_REDUCE_TYPE, hcom_op_type), | ||||
REPORT_INNER_ERROR("E19999", "Get Attr:%s in op:%s(%s) fail", | |||||
HCOM_ATTR_REDUCE_TYPE.c_str(), | |||||
REPORT_INNER_ERROR("E19999", "Get Attr:%s in op:%s(%s) fail", HCOM_ATTR_REDUCE_TYPE.c_str(), | |||||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | op_desc->GetName().c_str(), op_desc->GetType().c_str()); | ||||
return PARAM_INVALID, | return PARAM_INVALID, | ||||
"HcomOmeUtil:: Node: %s Optype: %s Get HCOM_ATTR_REDUCE_TYPE fail, not support!", | |||||
"[Get][Attr] %s in op:%s(%s) fail", HCOM_ATTR_REDUCE_TYPE.c_str(), | |||||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | op_desc->GetName().c_str(), op_desc->GetType().c_str()); | ||||
if (hcom_op_type == "min") { | if (hcom_op_type == "min") { | ||||
@@ -244,7 +252,9 @@ Status HcomOmeUtil::GetHcclOperationType(const ge::ConstOpDescPtr &op_desc, Hccl | |||||
REPORT_INNER_ERROR("E19999", "Attr:%s in Op:%s(%s), hcom_op_type value:%s is not support now, " | REPORT_INNER_ERROR("E19999", "Attr:%s in Op:%s(%s), hcom_op_type value:%s is not support now, " | ||||
"check invalid", HCOM_ATTR_REDUCE_TYPE.c_str(), | "check invalid", HCOM_ATTR_REDUCE_TYPE.c_str(), | ||||
op_desc->GetName().c_str(), op_desc->GetType().c_str(), hcom_op_type.c_str()); | op_desc->GetName().c_str(), op_desc->GetType().c_str(), hcom_op_type.c_str()); | ||||
GELOGE(PARAM_INVALID, "HcomOmeUtil::Get HCOM_ATTR_REDUCE_TYPE fail, [%s] not support!", hcom_op_type.c_str()); | |||||
GELOGE(PARAM_INVALID, "[Check][Param] Attr:%s in Op:%s(%s), hcom_op_type value:%s is not support now", | |||||
HCOM_ATTR_REDUCE_TYPE.c_str(), op_desc->GetName().c_str(), | |||||
op_desc->GetType().c_str(), hcom_op_type.c_str()); | |||||
return PARAM_INVALID; | return PARAM_INVALID; | ||||
} | } | ||||
} | } | ||||
@@ -256,7 +266,7 @@ Status HcomOmeUtil::GetHcclOperationType(const ge::ConstOpDescPtr &op_desc, Hccl | |||||
ATTR_HOROVOD_ATTR_REDUCE_TYPE.c_str(), | ATTR_HOROVOD_ATTR_REDUCE_TYPE.c_str(), | ||||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | op_desc->GetName().c_str(), op_desc->GetType().c_str()); | ||||
return PARAM_INVALID, | return PARAM_INVALID, | ||||
"HcomOmeUtil:: Node: %s Optype: %s Get ATTR_HOROVOD_ATTR_REDUCE_TYPE fail, not support!", | |||||
"[Get][Attr] %s in op:%s(%s) fail", ATTR_HOROVOD_ATTR_REDUCE_TYPE.c_str(), | |||||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | op_desc->GetName().c_str(), op_desc->GetType().c_str()); | ||||
auto iter = kHorovodRedOpToHcclRedOp.find(static_cast<HorovodReduceOp>(horovod_op_type)); | auto iter = kHorovodRedOpToHcclRedOp.find(static_cast<HorovodReduceOp>(horovod_op_type)); | ||||
@@ -264,8 +274,8 @@ Status HcomOmeUtil::GetHcclOperationType(const ge::ConstOpDescPtr &op_desc, Hccl | |||||
REPORT_INNER_ERROR("E19999", "Attr:%s in Op:%s(%s), horovod_op_type value:%ld is not support now, " | REPORT_INNER_ERROR("E19999", "Attr:%s in Op:%s(%s), horovod_op_type value:%ld is not support now, " | ||||
"check invalid", ATTR_HOROVOD_ATTR_REDUCE_TYPE.c_str(), | "check invalid", ATTR_HOROVOD_ATTR_REDUCE_TYPE.c_str(), | ||||
op_desc->GetName().c_str(), op_desc->GetType().c_str(), horovod_op_type); | op_desc->GetName().c_str(), op_desc->GetType().c_str(), horovod_op_type); | ||||
GELOGE(PARAM_INVALID, "HcomOmeUtil:: Node: %s Optype: %s HcomOpType cann't support! Current HcomOpType : %ld", | |||||
op_desc->GetName().c_str(), op_desc->GetType().c_str(), horovod_op_type); | |||||
GELOGE(PARAM_INVALID, "[Check][Param] Attr:%s in Op:%s(%s), horovod_op_type value:%ld is not support now", | |||||
ATTR_HOROVOD_ATTR_REDUCE_TYPE.c_str(), op_desc->GetName().c_str(), op_desc->GetType().c_str(), horovod_op_type); | |||||
return PARAM_INVALID; | return PARAM_INVALID; | ||||
} | } | ||||
op_type = iter->second; | op_type = iter->second; | ||||
@@ -281,7 +291,7 @@ Status HcomOmeUtil::GetHcclRootId(const ge::ConstOpDescPtr &op_desc, int64_t &ro | |||||
HCOM_ATTR_ROOT_RANK.c_str(), | HCOM_ATTR_ROOT_RANK.c_str(), | ||||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | op_desc->GetName().c_str(), op_desc->GetType().c_str()); | ||||
return PARAM_INVALID, | return PARAM_INVALID, | ||||
"HcomOmeUtil::Node %s Optype: %s Get HCOM_ATTR_ROOT_INDEX fail, not support!", | |||||
"[Get][Attr] %s in op:%s(%s) fail", HCOM_ATTR_ROOT_RANK.c_str(), | |||||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | op_desc->GetName().c_str(), op_desc->GetType().c_str()); | ||||
return SUCCESS; | return SUCCESS; | ||||
@@ -296,7 +306,7 @@ Status HcomOmeUtil::GetAllRootId(const ge::ConstOpDescPtr &op_desc, | |||||
int64_t root_id = 0; | int64_t root_id = 0; | ||||
Status dmrt = GetHcclRootId(op_desc, root_id); | Status dmrt = GetHcclRootId(op_desc, root_id); | ||||
if (dmrt != SUCCESS) { | if (dmrt != SUCCESS) { | ||||
GELOGE(FAILED, "davinci_model: GetHcomRootId fail! domi error: %u", dmrt); | |||||
GELOGE(FAILED, "[Get][HcclRootId] fail! domi error: %u", dmrt); | |||||
return FAILED; | return FAILED; | ||||
} | } | ||||
@@ -324,7 +334,8 @@ Status HcomOmeUtil::CheckKernelHcclInfo(const ge::ConstOpDescPtr &op_desc, | |||||
REPORT_INNER_ERROR("E19999", "Op:%s(%s) is not hcom op or param kernel_hccl_infos.size:%zu != 1, " | REPORT_INNER_ERROR("E19999", "Op:%s(%s) is not hcom op or param kernel_hccl_infos.size:%zu != 1, " | ||||
"check invalid", | "check invalid", | ||||
op_desc->GetName().c_str(), op_desc->GetType().c_str(), kernel_hccl_infos.size()); | op_desc->GetName().c_str(), op_desc->GetType().c_str(), kernel_hccl_infos.size()); | ||||
GELOGE(PARAM_INVALID, "HcomOmeUtil:: in Hcom scenario, the number of GETaskKernelHcclInfo is invalid."); | |||||
GELOGE(PARAM_INVALID, "[Check][Param] Op:%s(%s) is not hcom op or param kernel_hccl_infos.size:%zu != 1", | |||||
op_desc->GetName().c_str(), op_desc->GetType().c_str(), kernel_hccl_infos.size()); | |||||
return PARAM_INVALID; | return PARAM_INVALID; | ||||
} | } | ||||
@@ -337,7 +348,9 @@ Status HcomOmeUtil::CheckKernelHcclInfo(const ge::ConstOpDescPtr &op_desc, | |||||
"in op:%s(%s), check invalid", | "in op:%s(%s), check invalid", | ||||
kernel_hccl_infos.size(), op_desc->GetInputsSize(), | kernel_hccl_infos.size(), op_desc->GetInputsSize(), | ||||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | op_desc->GetName().c_str(), op_desc->GetType().c_str()); | ||||
GELOGE(PARAM_INVALID, "HcomOmeUtil:: in Horovod scenario, the number of GETaskKernelHcclInfo is invalid."); | |||||
GELOGE(PARAM_INVALID, "Param kernel_hccl_infos.size:%zu is empty or not equal to " | |||||
"input_desc size:%zu in op:%s(%s)", kernel_hccl_infos.size(), op_desc->GetInputsSize(), | |||||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||||
return PARAM_INVALID; | return PARAM_INVALID; | ||||
} | } | ||||
} | } | ||||
@@ -360,7 +373,7 @@ Status HcomOmeUtil::GetHorovodInputs(const ge::ConstOpDescPtr &op_desc, | |||||
} | } | ||||
if (CheckKernelHcclInfo(op_desc, kernel_hccl_infos) != SUCCESS) { | if (CheckKernelHcclInfo(op_desc, kernel_hccl_infos) != SUCCESS) { | ||||
GELOGE(PARAM_INVALID, "HcomOmeUtil:: Node: %s Optype: %s the number of GETaskKernelHcclInfo is invalid.", | |||||
GELOGE(PARAM_INVALID, "[Check][KernelHcclInfo] Node:%s Optype:%s the number of GETaskKernelHcclInfo is invalid.", | |||||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | op_desc->GetName().c_str(), op_desc->GetType().c_str()); | ||||
return PARAM_INVALID; | return PARAM_INVALID; | ||||
} | } | ||||
@@ -54,7 +54,7 @@ void VarAccelerateCtrl::SetVarChanged(const std::string &var_name) { | |||||
void VarAccelerateCtrl::AddGraph(uint32_t graph_id, const ComputeGraphPtr &compute_graph) { | void VarAccelerateCtrl::AddGraph(uint32_t graph_id, const ComputeGraphPtr &compute_graph) { | ||||
std::lock_guard<std::mutex> lock(mutex_); | std::lock_guard<std::mutex> lock(mutex_); | ||||
if (compute_graph == nullptr) { | if (compute_graph == nullptr) { | ||||
GELOGE(PARAM_INVALID, "Failed to add graph %u, the compute graph is null", graph_id); | |||||
GELOGE(PARAM_INVALID, "[Check][Param] Failed to add graph %u, the compute graph is null", graph_id); | |||||
return; | return; | ||||
} | } | ||||
auto &var_names = graph_ids_to_var_names_[graph_id]; | auto &var_names = graph_ids_to_var_names_[graph_id]; | ||||
@@ -253,8 +253,7 @@ bool CheckDynamicImagesizeInputShapeValid(map<string, vector<int64_t>> shape_map | |||||
for (auto str : split_set) { | for (auto str : split_set) { | ||||
split_dim = StringUtils::Split(str, ','); | split_dim = StringUtils::Split(str, ','); | ||||
if (split_dim.size() != static_cast<size_t>(kDynamicImageSizeNum)) { | if (split_dim.size() != static_cast<size_t>(kDynamicImageSizeNum)) { | ||||
ErrorManager::GetInstance().ATCReportErrMessage("E10020", {"DynamicImageSizeNum"}, | |||||
{std::to_string(kDynamicImageSizeNum)}); | |||||
ErrorManager::GetInstance().ATCReportErrMessage("E10020"); | |||||
GELOGE(ge::PARAM_INVALID, | GELOGE(ge::PARAM_INVALID, | ||||
"[Check][DynamicImagesizeInputShape] invalid value:%s number of dimensions of each group must be %ld.", | "[Check][DynamicImagesizeInputShape] invalid value:%s number of dimensions of each group must be %ld.", | ||||
dynamic_image_size.c_str(), kDynamicImageSizeNum); | dynamic_image_size.c_str(), kDynamicImageSizeNum); | ||||