Browse Source

!1638 Modify GELOGE messages

Merge pull request !1638 from ldy2021/master
tags/v1.3.0
计晨 Gitee 3 years ago
parent
commit
e64cbac7c8
25 changed files with 819 additions and 780 deletions
  1. +26
    -15
      ge/common/model_parser/model_parser.cc
  2. +30
    -30
      ge/graph/load/graph_loader.cc
  3. +179
    -190
      ge/graph/load/model_manager/model_manager.cc
  4. +1
    -1
      ge/graph/load/model_manager/model_manager.h
  5. +14
    -14
      ge/graph/load/model_manager/model_utils.cc
  6. +8
    -9
      ge/graph/load/model_manager/tbe_handle_store.cc
  7. +4
    -4
      ge/graph/load/model_manager/ts_mem_mall.h
  8. +18
    -6
      ge/graph/load/model_manager/zero_copy_offset.cc
  9. +5
    -6
      ge/graph/load/model_manager/zero_copy_task.cc
  10. +16
    -23
      ge/graph/manager/graph_caching_allocator.cc
  11. +10
    -9
      ge/graph/manager/graph_context.cc
  12. +267
    -263
      ge/graph/manager/graph_manager.cc
  13. +9
    -6
      ge/graph/manager/graph_manager_utils.cc
  14. +6
    -12
      ge/graph/manager/graph_mem_allocator.cc
  15. +49
    -36
      ge/graph/manager/graph_var_manager.cc
  16. +4
    -4
      ge/graph/manager/host_mem_allocator.cc
  17. +9
    -13
      ge/graph/manager/host_mem_manager.cc
  18. +8
    -10
      ge/graph/manager/memory_api.cc
  19. +1
    -1
      ge/graph/manager/model_manager/event_manager.cc
  20. +10
    -11
      ge/graph/manager/rdma_pool_allocator.cc
  21. +75
    -57
      ge/graph/manager/trans_var_data_utils.cc
  22. +4
    -6
      ge/graph/manager/util/debug.cc
  23. +64
    -51
      ge/graph/manager/util/hcom_util.cc
  24. +1
    -1
      ge/graph/manager/util/variable_accelerate_ctrl.cc
  25. +1
    -2
      ge/ir_build/option_utils.cc

+ 26
- 15
ge/common/model_parser/model_parser.cc View File

@@ -62,7 +62,6 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelParserBase::LoadFro


char *data = new (std::nothrow) char[len]; char *data = new (std::nothrow) char[len];
if (data == nullptr) { if (data == nullptr) {
GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Load model From file failed, bad memory allocation occur. (need:%u)", len);
GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Load][ModelFromFile]Failed, " GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Load][ModelFromFile]Failed, "
"bad memory allocation occur(need %u), file %s", len, model_path); "bad memory allocation occur(need %u), file %s", len, model_path);
REPORT_CALL_ERROR("E19999", "Load model from file %s failed, " REPORT_CALL_ERROR("E19999", "Load model from file %s failed, "
@@ -90,33 +89,45 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelParserBase::ParseMo
GE_CHECK_NOTNULL(model.model_data); GE_CHECK_NOTNULL(model.model_data);


// Model length too small // Model length too small
GE_CHK_BOOL_RET_STATUS(model.model_len >= sizeof(ModelFileHeader), ACL_ERROR_GE_EXEC_MODEL_DATA_SIZE_INVALID,
"Invalid model. Model data size %u must be greater than or equal to %zu.", model.model_len,
sizeof(ModelFileHeader));
GE_CHK_BOOL_EXEC(model.model_len >= sizeof(ModelFileHeader),
REPORT_INPUT_ERROR("E10003", std::vector<std::string>({"parameter", "value", "reason"}),
std::vector<std::string>({"om", model.om_name.c_str(), "invalid om file"}));
GELOGE(ACL_ERROR_GE_EXEC_MODEL_DATA_SIZE_INVALID,
"[Check][Param] Invalid model. Model data size %u must be greater than or equal to %zu.",
model.model_len, sizeof(ModelFileHeader));
return ACL_ERROR_GE_EXEC_MODEL_DATA_SIZE_INVALID;);
// Get file header // Get file header
auto file_header = reinterpret_cast<ModelFileHeader *>(model.model_data); auto file_header = reinterpret_cast<ModelFileHeader *>(model.model_data);
// Determine whether the file length and magic number match // Determine whether the file length and magic number match
GE_CHK_BOOL_RET_STATUS(
file_header->length == model.model_len - sizeof(ModelFileHeader) && file_header->magic == MODEL_FILE_MAGIC_NUM,
ACL_ERROR_GE_EXEC_MODEL_DATA_SIZE_INVALID,
"Invalid model. file_header->length[%u] + sizeof(ModelFileHeader)[%zu] != model->model_len[%u] || "
"MODEL_FILE_MAGIC_NUM[%u] != file_header->magic[%u]",
file_header->length, sizeof(ModelFileHeader), model.model_len, MODEL_FILE_MAGIC_NUM, file_header->magic);

GE_CHK_BOOL_EXEC(file_header->length == model.model_len - sizeof(ModelFileHeader) &&
file_header->magic == MODEL_FILE_MAGIC_NUM,
REPORT_INPUT_ERROR("E10003", std::vector<std::string>({"parameter", "value", "reason"}),
std::vector<std::string>({"om", model.om_name.c_str(), "invalid om file"}));
GELOGE(ACL_ERROR_GE_EXEC_MODEL_DATA_SIZE_INVALID,
"[Check][Param] Invalid model, file_header->length[%u] + sizeof(ModelFileHeader)[%zu] != "
"model->model_len[%u] || MODEL_FILE_MAGIC_NUM[%u] != file_header->magic[%u]",
file_header->length, sizeof(ModelFileHeader), model.model_len,
MODEL_FILE_MAGIC_NUM, file_header->magic);
return ACL_ERROR_GE_EXEC_MODEL_DATA_SIZE_INVALID;);
Status res = SUCCESS; Status res = SUCCESS;


// Get data address // Get data address
uint8_t *data = reinterpret_cast<uint8_t *>(model.model_data) + sizeof(ModelFileHeader); uint8_t *data = reinterpret_cast<uint8_t *>(model.model_data) + sizeof(ModelFileHeader);
if (file_header->is_encrypt == ModelEncryptType::UNENCRYPTED) { // Unencrypted model if (file_header->is_encrypt == ModelEncryptType::UNENCRYPTED) { // Unencrypted model
GE_CHK_BOOL_RET_STATUS(model.key.empty(), ACL_ERROR_GE_PARAM_INVALID,
"Invalid param. model is unencrypted, but key is not empty.");

if (!model.key.empty()) {
REPORT_INPUT_ERROR("E10003", std::vector<std::string>({"parameter", "value", "reason"}),
std::vector<std::string>({"om", model.om_name.c_str(), "invalid om file"}));
GELOGE(ACL_ERROR_GE_PARAM_INVALID,
"[Check][Param] Invalid param, model is unencrypted, but key is not empty.");
return ACL_ERROR_GE_PARAM_INVALID;
}
model_data = data; model_data = data;
model_len = file_header->length; model_len = file_header->length;
GELOGD("Model_len is %u, model_file_head_len is %zu.", model_len, sizeof(ModelFileHeader)); GELOGD("Model_len is %u, model_file_head_len is %zu.", model_len, sizeof(ModelFileHeader));
} else { } else {
GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Check][Param]Invalid, model encrypt type not supported"); GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Check][Param]Invalid, model encrypt type not supported");
REPORT_CALL_ERROR("E19999","Invalid model, encrypt type not supported");
REPORT_INPUT_ERROR("E10003", std::vector<std::string>({"parameter", "value", "reason"}),
std::vector<std::string>({"om", model.om_name.c_str(), "invalid om file"}));
res = ACL_ERROR_GE_PARAM_INVALID; res = ACL_ERROR_GE_PARAM_INVALID;
} }




+ 30
- 30
ge/graph/load/graph_loader.cc View File

@@ -33,12 +33,12 @@ Status GraphLoader::UnloadModel(uint32_t model_id) {


Status ret = model_manager->Stop(model_id); Status ret = model_manager->Stop(model_id);
if (ret != SUCCESS) { if (ret != SUCCESS) {
GELOGE(ret, "UnloadModel: Stop failed. model id:%u", model_id);
GELOGE(ret, "[Stop][Model] failed. model id:%u", model_id);
} }


ret = model_manager->Unload(model_id); ret = model_manager->Unload(model_id);
if (ret != SUCCESS) { if (ret != SUCCESS) {
GELOGE(ret, "UnloadModel: Unload failed. model id:%u", model_id);
GELOGE(ret, "[Unload][Model] failed. model id:%u", model_id);
return ret; return ret;
} }
GELOGI("UnLoad model success, model id:%u.", model_id); GELOGI("UnLoad model success, model id:%u.", model_id);
@@ -50,14 +50,13 @@ Status GraphLoader::LoadModelOnline(uint32_t &model_id, const std::shared_ptr<ge
GELOGI("Load model online begin."); GELOGI("Load model online begin.");
rtError_t rt_ret = rtSetDevice(GetContext().DeviceId()); rtError_t rt_ret = rtSetDevice(GetContext().DeviceId());
if (rt_ret != RT_ERROR_NONE) { if (rt_ret != RT_ERROR_NONE) {
REPORT_CALL_ERROR("E19999", "Call rtSetDevice failed, device_id:%u, ret:0x%X",
GetContext().DeviceId(), rt_ret);
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret);
REPORT_CALL_ERROR("E19999", "Call rtSetDevice failed, device_id:%u, ret:0x%X", GetContext().DeviceId(), rt_ret);
GELOGE(RT_FAILED, "[Call][RtSetDevice] failed, device_id:%u, ret:0x%X", GetContext().DeviceId(), rt_ret);
return RT_FAILED; return RT_FAILED;
} }
if (ge_root_model_ptr == nullptr) { if (ge_root_model_ptr == nullptr) {
REPORT_INNER_ERROR("E19999", "Check param ge_root_model_ptr nullptr, check invalid"); REPORT_INNER_ERROR("E19999", "Check param ge_root_model_ptr nullptr, check invalid");
GELOGE(GE_GRAPH_PARAM_NULLPTR, "[LoadGraph] GE load graph model_ptr is nullptr.");
GELOGE(GE_GRAPH_PARAM_NULLPTR, "[LoadGraph][Check][Param] GE load graph model_ptr is nullptr.");
return GE_GRAPH_PARAM_NULLPTR; return GE_GRAPH_PARAM_NULLPTR;
} }


@@ -65,12 +64,12 @@ Status GraphLoader::LoadModelOnline(uint32_t &model_id, const std::shared_ptr<ge
GE_CHECK_NOTNULL(model_manager); GE_CHECK_NOTNULL(model_manager);
Status ret = model_manager->LoadModelOnline(model_id, ge_root_model_ptr, listener); Status ret = model_manager->LoadModelOnline(model_id, ge_root_model_ptr, listener);
if (ret != SUCCESS) { if (ret != SUCCESS) {
GELOGE(ret, "LoadModel: Load failed. ret = %u", ret);
GELOGE(ret, "[Load][Model] Online failed. ret = %u, model_id:%u", ret, model_id);
rt_ret = rtDeviceReset(GetContext().DeviceId()); rt_ret = rtDeviceReset(GetContext().DeviceId());
if (rt_ret != RT_ERROR_NONE) { if (rt_ret != RT_ERROR_NONE) {
REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, ret:0x%X", REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, ret:0x%X",
GetContext().DeviceId(), rt_ret); GetContext().DeviceId(), rt_ret);
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret);
GELOGE(RT_FAILED, "[Call][RtDeviceReset] failed, device_id:%u, ret:0x%X", GetContext().DeviceId(), rt_ret);
} }
return ret; return ret;
} }
@@ -81,31 +80,31 @@ Status GraphLoader::LoadModelOnline(uint32_t &model_id, const std::shared_ptr<ge
if (rt_ret != RT_ERROR_NONE) { if (rt_ret != RT_ERROR_NONE) {
REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, ret:0x%X", REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, ret:0x%X",
GetContext().DeviceId(), rt_ret); GetContext().DeviceId(), rt_ret);
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret);
GELOGE(RT_FAILED, "[Call][RtDeviceReset] failed, device_id:%u, ret:0x%X", GetContext().DeviceId(), rt_ret);
} }
return SUCCESS; return SUCCESS;
} }
ret = model_manager->Start(model_id); ret = model_manager->Start(model_id);
if (ret != SUCCESS) { if (ret != SUCCESS) {
if (model_manager->Unload(model_id) != SUCCESS) { if (model_manager->Unload(model_id) != SUCCESS) {
GELOGE(ret, "LoadModel: Unload failed while trying to unload after a failed start.");
GELOGE(ret, "[Unload][Model] failed while trying to unload after a failed start, model_id:%u.", model_id);
} }


rt_ret = rtDeviceReset(GetContext().DeviceId()); rt_ret = rtDeviceReset(GetContext().DeviceId());
if (rt_ret != RT_ERROR_NONE) { if (rt_ret != RT_ERROR_NONE) {
REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, ret:0x%X", REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, ret:0x%X",
GetContext().DeviceId(), rt_ret); GetContext().DeviceId(), rt_ret);
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret);
GELOGE(RT_FAILED, "[Call][RtDeviceReset] failed, device_id:%u, ret:0x%X", GetContext().DeviceId(), rt_ret);
} }


GELOGE(ret, "LoadModel: Start failed.");
GELOGE(ret, "[Start][Model] failed, model_id:%u.", model_id);
return ret; return ret;
} }
rt_ret = rtDeviceReset(GetContext().DeviceId()); rt_ret = rtDeviceReset(GetContext().DeviceId());
if (rt_ret != RT_ERROR_NONE) { if (rt_ret != RT_ERROR_NONE) {
REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, ret:0x%X", REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, ret:0x%X",
GetContext().DeviceId(), rt_ret); GetContext().DeviceId(), rt_ret);
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret);
GELOGE(RT_FAILED, "[Call][RtDeviceReset] failed, device_id:%u, ret:0x%X", GetContext().DeviceId(), rt_ret);
return RT_FAILED; return RT_FAILED;
} }
GELOGI("Load model online success, model_id:%u.", model_id); GELOGI("Load model online success, model_id:%u.", model_id);
@@ -118,7 +117,7 @@ Status GraphLoader::GetMaxUsedMemory(uint32_t model_id, uint64_t &max_size) {
GE_CHECK_NOTNULL(model_manager); GE_CHECK_NOTNULL(model_manager);
Status ret = model_manager->GetMaxUsedMemory(model_id, max_size); Status ret = model_manager->GetMaxUsedMemory(model_id, max_size);
if (ret != SUCCESS) { if (ret != SUCCESS) {
GELOGE(ret, "GetMaxUsedMemory: GetMaxUsedMemory failed.");
GELOGE(ret, "[Call][GetMaxUsedMemory] failed, model_id:%u.", model_id);
return ret; return ret;
} }
return SUCCESS; return SUCCESS;
@@ -127,21 +126,20 @@ Status GraphLoader::GetMaxUsedMemory(uint32_t model_id, uint64_t &max_size) {
Status GraphLoader::LoadDataFromFile(const std::string &path, const std::string &key_path, int32_t priority, Status GraphLoader::LoadDataFromFile(const std::string &path, const std::string &key_path, int32_t priority,
ModelData &model_data) { ModelData &model_data) {
if (!CheckInputPathValid(path)) { if (!CheckInputPathValid(path)) {
GELOGE(ACL_ERROR_GE_EXEC_MODEL_PATH_INVALID, "model path is invalid: %s", path.c_str());
GELOGE(ACL_ERROR_GE_EXEC_MODEL_PATH_INVALID, "[Check][Param] model path is invalid:%s", path.c_str());
return ACL_ERROR_GE_EXEC_MODEL_PATH_INVALID; return ACL_ERROR_GE_EXEC_MODEL_PATH_INVALID;
} }


GELOGI("Load model begin, model path is: %s", path.c_str()); GELOGI("Load model begin, model path is: %s", path.c_str());
if (!key_path.empty() && !CheckInputPathValid(key_path)) { if (!key_path.empty() && !CheckInputPathValid(key_path)) {
REPORT_INNER_ERROR("E19999", "Param key_path:%s empty or invalid",
key_path.c_str());
GELOGE(ACL_ERROR_GE_PARAM_INVALID, "decrypt_key path is invalid: %s", key_path.c_str());
REPORT_INNER_ERROR("E19999", "Param key_path:%s empty or invalid", key_path.c_str());
GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Check][Param] decrypt_key path is invalid:%s", key_path.c_str());
return ACL_ERROR_GE_PARAM_INVALID; return ACL_ERROR_GE_PARAM_INVALID;
} }


Status ret = ModelParserBase::LoadFromFile(path.c_str(), key_path.c_str(), priority, model_data); Status ret = ModelParserBase::LoadFromFile(path.c_str(), key_path.c_str(), priority, model_data);
if (ret != SUCCESS) { if (ret != SUCCESS) {
GELOGE(ret, "LoadModelFromFile: Load failed. ret = %u", ret);
GELOGE(ret, "[Call][LoadFromFile] failed. ret = %u, path:%s, key path:%s", ret, path.c_str(), key_path.c_str());
if (model_data.model_data != nullptr) { if (model_data.model_data != nullptr) {
delete[] static_cast<char *>(model_data.model_data); delete[] static_cast<char *>(model_data.model_data);
model_data.model_data = nullptr; model_data.model_data = nullptr;
@@ -156,18 +154,19 @@ Status GraphLoader::CommandHandle(const Command &command) {
GE_CHECK_NOTNULL(model_manager); GE_CHECK_NOTNULL(model_manager);
Status ret = model_manager->HandleCommand(command); Status ret = model_manager->HandleCommand(command);
if (ret != SUCCESS) { if (ret != SUCCESS) {
GELOGE(ret, "CommandHandle: Command Handle failed.");
GELOGE(ret, "[Handle][Command] failed, module_index:%lu.", command.module_index);


return ret; return ret;
} }
} catch (std::bad_alloc &) { } catch (std::bad_alloc &) {
REPORT_INNER_ERROR("E19999", "Bad memory allocation occur"); REPORT_INNER_ERROR("E19999", "Bad memory allocation occur");
GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Command handle failed, bad memory allocation occur !");
GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Handle][Command] failed, "
"bad memory allocation occur, module_index:%lu.", command.module_index);


return ACL_ERROR_GE_MEMORY_ALLOCATION; return ACL_ERROR_GE_MEMORY_ALLOCATION;
} catch (...) { } catch (...) {
REPORT_INNER_ERROR("E19999", "Some exceptions occur"); REPORT_INNER_ERROR("E19999", "Some exceptions occur");
GELOGE(FAILED, "Command handle failed, some exceptions occur !");
GELOGE(FAILED, "[Handle][Command] failed, some exceptions occur, module_index:%lu.", command.module_index);


return FAILED; return FAILED;
} }
@@ -184,7 +183,7 @@ Status GraphLoader::LoadModelFromData(uint32_t &model_id, const ModelData &model
Status ret = model_manager->LoadModelOffline( Status ret = model_manager->LoadModelOffline(
model_id, model_data, nullptr, dev_ptr, mem_size, weight_ptr, weight_size); model_id, model_data, nullptr, dev_ptr, mem_size, weight_ptr, weight_size);
if (ret != SUCCESS) { if (ret != SUCCESS) {
GELOGE(ret, "Load model failed, model_id:%u.", model_id);
GELOGE(ret, "[Load][Model] failed, model_id:%u.", model_id);
return ret; return ret;
} }
GELOGI("Load model success, model_id:%u.", model_id); GELOGI("Load model success, model_id:%u.", model_id);
@@ -210,7 +209,7 @@ Status GraphLoader::LoadModelWithQ(uint32_t &model_id, const ModelData &model_da
GE_CHECK_NOTNULL(model_manager); GE_CHECK_NOTNULL(model_manager);
Status ret = model_manager->LoadModelWithQ(model_id, model_data, input_queue_ids, output_queue_ids); Status ret = model_manager->LoadModelWithQ(model_id, model_data, input_queue_ids, output_queue_ids);
if (ret != SUCCESS) { if (ret != SUCCESS) {
GELOGE(ret, "Load model with queue failed, model_id:%u.", model_id);
GELOGE(ret, "[Load][Model] with queue failed, model_id:%u.", model_id);
return ret; return ret;
} }


@@ -237,7 +236,7 @@ Status GraphLoader::ExecuteModel(uint32_t model_id, rtStream_t stream, bool asyn
Status ret = model_manager->ExecuteModel(model_id, stream, async_mode, Status ret = model_manager->ExecuteModel(model_id, stream, async_mode,
input_data, input_desc, output_data, output_desc); input_data, input_desc, output_data, output_desc);
if (ret != SUCCESS) { if (ret != SUCCESS) {
GELOGE(ret, "Execute model failed, model_id:%u.", model_id);
GELOGE(ret, "[Execute][Model] failed, model_id:%u.", model_id);
return ret; return ret;
} }


@@ -250,7 +249,7 @@ Status GraphLoader::GetMemoryInfo(int64_t &free) {
if (rt_ret != RT_ERROR_NONE) { if (rt_ret != RT_ERROR_NONE) {
REPORT_CALL_ERROR("E19999", "Call rtSetDevice failed, device_id:%u, ret:0x%X", REPORT_CALL_ERROR("E19999", "Call rtSetDevice failed, device_id:%u, ret:0x%X",
GetContext().DeviceId(), rt_ret); GetContext().DeviceId(), rt_ret);
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret);
GELOGE(RT_FAILED, "[Call][RtSetDevice] failed, device_id:%u, ret:0x%X", GetContext().DeviceId(), rt_ret);
return RT_FAILED; return RT_FAILED;
} }
size_t total_mem = 0; size_t total_mem = 0;
@@ -258,14 +257,14 @@ Status GraphLoader::GetMemoryInfo(int64_t &free) {
rt_ret = rtMemGetInfo(&free_mem, &total_mem); rt_ret = rtMemGetInfo(&free_mem, &total_mem);
if (rt_ret != RT_ERROR_NONE) { if (rt_ret != RT_ERROR_NONE) {
REPORT_CALL_ERROR("E19999", "Call rtMemGetInfo failed, ret:0x%X", rt_ret); REPORT_CALL_ERROR("E19999", "Call rtMemGetInfo failed, ret:0x%X", rt_ret);
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret);
GELOGE(RT_FAILED, "[Call][RtMemGetInfo] failed, ret:0x%X", rt_ret);
return RT_FAILED; return RT_FAILED;
} }
rt_ret = rtDeviceReset(GetContext().DeviceId()); rt_ret = rtDeviceReset(GetContext().DeviceId());
if (rt_ret != RT_ERROR_NONE) { if (rt_ret != RT_ERROR_NONE) {
REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, ret:0x%X", REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, ret:0x%X",
GetContext().DeviceId(), rt_ret); GetContext().DeviceId(), rt_ret);
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret);
GELOGE(RT_FAILED, "[Call][RtDeviceReset] failed, device_id:%u, ret:0x%X", GetContext().DeviceId(), rt_ret);
return RT_FAILED; return RT_FAILED;
} }
// Add small page memory size // Add small page memory size
@@ -280,7 +279,8 @@ Status GraphLoader::DestroyAicpuKernel(uint64_t session_id, uint32_t model_id, u
GE_CHECK_NOTNULL(model_manager); GE_CHECK_NOTNULL(model_manager);
Status ret = model_manager->DestroyAicpuKernel(session_id, model_id, sub_model_id); Status ret = model_manager->DestroyAicpuKernel(session_id, model_id, sub_model_id);
if (ret != SUCCESS) { if (ret != SUCCESS) {
GELOGE(ret, "Destroy aicpu kernel failed.");
GELOGE(ret, "[Destroy][AicpuKernel] failed, session_id:%lu, model_id:%u, sub_model_id:%u.",
session_id, model_id, sub_model_id);
return ret; return ret;
} }
return SUCCESS; return SUCCESS;
@@ -291,7 +291,7 @@ Status GraphLoader::DestroyAicpuSessionForInfer(uint32_t model_id) {
GE_CHECK_NOTNULL(model_manager); GE_CHECK_NOTNULL(model_manager);
Status ret = model_manager->DestroyAicpuSessionForInfer(model_id); Status ret = model_manager->DestroyAicpuSessionForInfer(model_id);
if (ret != SUCCESS) { if (ret != SUCCESS) {
GELOGE(ret, "Destroy aicpu serrion for infer failed.");
GELOGE(ret, "[Call][DestroyAicpuSessionForInfer] failed, model_id:%u.", model_id);
return ret; return ret;
} }
return SUCCESS; return SUCCESS;


+ 179
- 190
ge/graph/load/model_manager/model_manager.cc
File diff suppressed because it is too large
View File


+ 1
- 1
ge/graph/load/model_manager/model_manager.h View File

@@ -310,7 +310,7 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager {
std::lock_guard<std::mutex> lock(exeception_infos_mutex_); std::lock_guard<std::mutex> lock(exeception_infos_mutex_);
auto instance = ModelManager::GetInstance(); auto instance = ModelManager::GetInstance();
if (instance == nullptr) { if (instance == nullptr) {
GELOGE(FAILED, "Instance is nullptr");
GELOGE(FAILED, "[Get][Instance] failed, as ret is nullptr");
return; return;
} }
instance->AddExceptionInfo(*rt_exception_info); instance->AddExceptionInfo(*rt_exception_info);


+ 14
- 14
ge/graph/load/model_manager/model_utils.cc View File

@@ -26,10 +26,10 @@
#define VALIDATE_MEM_RANGE(OP, SIZE, OFFSET) \ #define VALIDATE_MEM_RANGE(OP, SIZE, OFFSET) \
do { \ do { \
if (SIZE <= static_cast<uint64_t>(OFFSET)) { \ if (SIZE <= static_cast<uint64_t>(OFFSET)) { \
REPORT_INNER_ERROR("E19999", \
"Node:%s(%s) offset:%ld out of range size:%lu, check invalid", \
REPORT_INNER_ERROR("E19999", "Node:%s(%s) offset:%ld out of range size:%lu, check invalid", \
OP->GetName().c_str(), OP->GetType().c_str(), OFFSET, SIZE); \ OP->GetName().c_str(), OP->GetType().c_str(), OFFSET, SIZE); \
GELOGE(OUT_OF_MEMORY, "Node: %s, memory out of range[%lu: %ld]", OP->GetName().c_str(), SIZE, OFFSET); \
GELOGE(OUT_OF_MEMORY, "[Check][Param]Node: %s, memory out of range[%lu: %ld]", \
OP->GetName().c_str(), SIZE, OFFSET); \
return {}; \ return {}; \
} \ } \
} while (0) } while (0)
@@ -312,8 +312,9 @@ vector<void *> ModelUtils::GetInputDataAddrs(const RuntimeParam &model_param, Co
REPORT_INNER_ERROR("E19999", "Attr:%s, memory_type.size:%zu != input_desc.size:%zu, op:%s(%s), check invalid", REPORT_INNER_ERROR("E19999", "Attr:%s, memory_type.size:%zu != input_desc.size:%zu, op:%s(%s), check invalid",
ATTR_NAME_INPUT_MEM_TYPE_LIST.c_str(), v_memory_type.size(), inputs_size, ATTR_NAME_INPUT_MEM_TYPE_LIST.c_str(), v_memory_type.size(), inputs_size,
op_desc->GetName().c_str(), op_desc->GetType().c_str()); op_desc->GetName().c_str(), op_desc->GetType().c_str());
GELOGE(PARAM_INVALID, "Fusion: check input size failed, op: %s, input v_memory_type size: %zu input numbers: %zu",
op_desc->GetName().c_str(), v_memory_type.size(), inputs_size);
GELOGE(PARAM_INVALID, "[Check][Param] Attr:%s, memory_type.size:%zu != input_desc.size:%zu, op:%s(%s)",
ATTR_NAME_INPUT_MEM_TYPE_LIST.c_str(), v_memory_type.size(), inputs_size,
op_desc->GetName().c_str(), op_desc->GetType().c_str());
return v_input_data_addr; return v_input_data_addr;
} }
for (size_t i = 0; i < op_desc->GetAllInputsSize(); ++i) { for (size_t i = 0; i < op_desc->GetAllInputsSize(); ++i) {
@@ -392,8 +393,7 @@ Status ModelUtils::GetVarAddr(const RuntimeParam &model_param, const ConstOpDesc
case RT_MEMORY_RDMA_HBM: case RT_MEMORY_RDMA_HBM:
if (offset < 0) { if (offset < 0) {
REPORT_INNER_ERROR("E19999", "Param offset:%ld < 0, check invalid", offset); REPORT_INNER_ERROR("E19999", "Param offset:%ld < 0, check invalid", offset);
GELOGE(PARAM_INVALID, "rdma var addr is invalid, addr=%p",
reinterpret_cast<uint8_t *>(static_cast<uintptr_t>(offset)));
GELOGE(PARAM_INVALID, "[Check][Param] Param offset:%ld cannot be negative", offset);
return PARAM_INVALID; return PARAM_INVALID;
} }
var_addr = reinterpret_cast<uint8_t *>(static_cast<uintptr_t>(offset)); var_addr = reinterpret_cast<uint8_t *>(static_cast<uintptr_t>(offset));
@@ -403,9 +403,9 @@ Status ModelUtils::GetVarAddr(const RuntimeParam &model_param, const ConstOpDesc
var_addr = model_param.var_base + offset - model_param.logic_var_base; var_addr = model_param.var_base + offset - model_param.logic_var_base;
break; break;
default: default:
REPORT_INNER_ERROR("E19999", "Get mem_type:%d for offset:%ld is unsupported, check invalid",
mem_type, offset);
GELOGE(PARAM_INVALID, "unsupported memory type %u", mem_type);
REPORT_INNER_ERROR("E19999", "Get mem_type:%d for offset:%ld is unsupported, check invalid", mem_type, offset);
GELOGE(PARAM_INVALID, "[Check][Param] Get mem_type:%d for offset:%ld is unsupported, check invalid",
mem_type, offset);
return PARAM_INVALID; return PARAM_INVALID;
} }
GE_CHECK_NOTNULL(var_addr); GE_CHECK_NOTNULL(var_addr);
@@ -433,9 +433,9 @@ vector<void *> ModelUtils::GetOutputDataAddrs(const RuntimeParam &model_param, C
REPORT_INNER_ERROR("E19999", "Attr:%s, memory_type.size:%zu != output_desc.size:%zu, op:%s(%s), check invalid", REPORT_INNER_ERROR("E19999", "Attr:%s, memory_type.size:%zu != output_desc.size:%zu, op:%s(%s), check invalid",
ATTR_NAME_OUTPUT_MEM_TYPE_LIST.c_str(), v_memory_type.size(), outputs_size, ATTR_NAME_OUTPUT_MEM_TYPE_LIST.c_str(), v_memory_type.size(), outputs_size,
op_desc->GetName().c_str(), op_desc->GetType().c_str()); op_desc->GetName().c_str(), op_desc->GetType().c_str());
GELOGE(PARAM_INVALID,
"Fusion: check output size failed, op: %s, output v_memory_type size: %lu output numbers: %zu",
op_desc->GetName().c_str(), v_memory_type.size(), outputs_size);
GELOGE(PARAM_INVALID, "[Check][Param] Attr:%s, memory_type.size:%zu != output_desc.size:%zu, op:%s(%s)",
ATTR_NAME_OUTPUT_MEM_TYPE_LIST.c_str(), v_memory_type.size(), outputs_size,
op_desc->GetName().c_str(), op_desc->GetType().c_str());
return v_output_data_addr; return v_output_data_addr;
} }
for (size_t i = 0; i < outputs_size; ++i) { for (size_t i = 0; i < outputs_size; ++i) {
@@ -594,7 +594,7 @@ Status ModelUtils::GetRtAddress(const RuntimeParam &param, uintptr_t logic_addr,
} else if (logic_addr != 0) { } else if (logic_addr != 0) {
mem_addr = nullptr; mem_addr = nullptr;
REPORT_INNER_ERROR("E19999", "Check param logic addr:0x%lx abnormal", logic_addr); REPORT_INNER_ERROR("E19999", "Check param logic addr:0x%lx abnormal", logic_addr);
GELOGE(PARAM_INVALID, "The logic addr:0x%lx is abnormal", logic_addr);
GELOGE(PARAM_INVALID, "[Check][Param] The logic addr:0x%lx is abnormal", logic_addr);
return PARAM_INVALID; return PARAM_INVALID;
} }




+ 8
- 9
ge/graph/load/model_manager/tbe_handle_store.cc View File

@@ -24,7 +24,7 @@ namespace ge {
void TbeHandleInfo::used_inc(uint32_t num) { void TbeHandleInfo::used_inc(uint32_t num) {
if (used_ > std::numeric_limits<uint32_t>::max() - num) { if (used_ > std::numeric_limits<uint32_t>::max() - num) {
REPORT_INNER_ERROR("E19999", "Used:%u reach numeric max", used_); REPORT_INNER_ERROR("E19999", "Used:%u reach numeric max", used_);
GELOGE(INTERNAL_ERROR, "Used[%u] reach numeric max.", used_);
GELOGE(INTERNAL_ERROR, "[Check][Param] Used[%u] reach numeric max.", used_);
return; return;
} }


@@ -34,7 +34,7 @@ void TbeHandleInfo::used_inc(uint32_t num) {
void TbeHandleInfo::used_dec(uint32_t num) { void TbeHandleInfo::used_dec(uint32_t num) {
if (used_ < std::numeric_limits<uint32_t>::min() + num) { if (used_ < std::numeric_limits<uint32_t>::min() + num) {
REPORT_INNER_ERROR("E19999", "Used:%u reach numeric min", used_); REPORT_INNER_ERROR("E19999", "Used:%u reach numeric min", used_);
GELOGE(INTERNAL_ERROR, "Used[%u] reach numeric min.", used_);
GELOGE(INTERNAL_ERROR, "[Check][Param] Used[%u] reach numeric min.", used_);
return; return;
} }


@@ -107,9 +107,8 @@ void TBEHandleStore::ReferTBEHandle(const std::string &name) {
std::lock_guard<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
auto it = kernels_.find(name); auto it = kernels_.find(name);
if (it == kernels_.end()) { if (it == kernels_.end()) {
REPORT_INNER_ERROR("E19999", "Kernel:%s not found in stored check invalid",
name.c_str());
GELOGE(INTERNAL_ERROR, "Kernel[%s] not found in stored.", name.c_str());
REPORT_INNER_ERROR("E19999", "Kernel:%s not found in stored check invalid", name.c_str());
GELOGE(INTERNAL_ERROR, "[Check][Param] Kernel[%s] not found in stored.", name.c_str());
return; return;
} }


@@ -128,9 +127,8 @@ void TBEHandleStore::EraseTBEHandle(const std::map<std::string, uint32_t> &names
for (auto &item : names) { for (auto &item : names) {
auto it = kernels_.find(item.first); auto it = kernels_.find(item.first);
if (it == kernels_.end()) { if (it == kernels_.end()) {
REPORT_INNER_ERROR("E19999", "Kernel:%s not found in stored check invalid",
item.first.c_str());
GELOGE(INTERNAL_ERROR, "Kernel[%s] not found in stored.", item.first.c_str());
REPORT_INNER_ERROR("E19999", "Kernel:%s not found in stored check invalid", item.first.c_str());
GELOGE(INTERNAL_ERROR, "[Check][Param] Kernel[%s] not found in stored.", item.first.c_str());
continue; continue;
} }


@@ -142,7 +140,8 @@ void TBEHandleStore::EraseTBEHandle(const std::map<std::string, uint32_t> &names
if (rt_ret != RT_ERROR_NONE) { if (rt_ret != RT_ERROR_NONE) {
REPORT_INNER_ERROR("E19999", "Call rtDevBinaryUnRegister failed for Kernel:%s fail, ret:0x%X", REPORT_INNER_ERROR("E19999", "Call rtDevBinaryUnRegister failed for Kernel:%s fail, ret:0x%X",
item.first.c_str(), rt_ret); item.first.c_str(), rt_ret);
GELOGE(INTERNAL_ERROR, "Kernel[%s] UnRegister handle fail:%u.", item.first.c_str(), rt_ret);
GELOGE(INTERNAL_ERROR, "[Call][RtDevBinaryUnRegister] Kernel[%s] UnRegister handle fail:%u.",
item.first.c_str(), rt_ret);
} }
kernels_.erase(it); kernels_.erase(it);
} }


+ 4
- 4
ge/graph/load/model_manager/ts_mem_mall.h View File

@@ -43,7 +43,7 @@ class TsMemMall {
for (auto it : mem_store_size_) { for (auto it : mem_store_size_) {
rtError_t ret = rtFree(it.second); rtError_t ret = rtFree(it.second);
if (ret != RT_ERROR_NONE) { if (ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rtFree failed, ret: 0x%X", ret);
GELOGE(RT_FAILED, "[Call][RtFree] failed, ret:0x%X", ret);
} }
} }
mem_store_size_.clear(); mem_store_size_.clear();
@@ -52,7 +52,7 @@ class TsMemMall {


void *Acquire(int64_t offset, uint64_t size) { void *Acquire(int64_t offset, uint64_t size) {
if (size == 0) { if (size == 0) {
GELOGE(RT_FAILED, "Acquire mem block failed, size: %lu", size);
GELOGE(RT_FAILED, "[Check][Param] Acquire mem block failed, size:%lu", size);
return nullptr; return nullptr;
} }


@@ -71,7 +71,7 @@ class TsMemMall {
void *addr = nullptr; void *addr = nullptr;
rtError_t rt_ret = rtMalloc(&addr, bytes, mem_type_); rtError_t rt_ret = rtMalloc(&addr, bytes, mem_type_);
if (rt_ret != RT_ERROR_NONE) { if (rt_ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rtMalloc failed, ret: 0x%X", rt_ret);
GELOGE(RT_FAILED, "[Call][RtMalloc] failed, size:%lu, ret:0x%X", bytes, rt_ret);
return nullptr; return nullptr;
} }


@@ -94,7 +94,7 @@ class TsMemMall {
mem_store_addr_.erase(it); mem_store_addr_.erase(it);
rtError_t ret = rtFree(addr); rtError_t ret = rtFree(addr);
if (ret != RT_ERROR_NONE) { if (ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rtFree failed, ret: 0x%X", ret);
GELOGE(RT_FAILED, "[Call][RtFree] failed, ret:0x%X", ret);
} }
} }




+ 18
- 6
ge/graph/load/model_manager/zero_copy_offset.cc View File

@@ -38,8 +38,13 @@ Status ZeroCopyOffset::InitInputDataInfo(int64_t output_size, void *virtual_addr
op_name_ = op_desc->GetName(); op_name_ = op_desc->GetName();
(void)ge::AttrUtils::GetListInt(op_desc, ATTR_ZERO_COPY_BASIC_OFFSET, zero_copy_basic_offset_); (void)ge::AttrUtils::GetListInt(op_desc, ATTR_ZERO_COPY_BASIC_OFFSET, zero_copy_basic_offset_);
(void)ge::AttrUtils::GetListInt(op_desc, ATTR_ZERO_COPY_RELATIVE_OFFSET, zero_copy_relative_offset_); (void)ge::AttrUtils::GetListInt(op_desc, ATTR_ZERO_COPY_RELATIVE_OFFSET, zero_copy_relative_offset_);
GE_CHK_BOOL_EXEC(zero_copy_basic_offset_.size() == zero_copy_relative_offset_.size(), return PARAM_INVALID,
"basic_offset_size should be equal to relative_offset_size");
GE_CHK_BOOL_EXEC(zero_copy_basic_offset_.size() == zero_copy_relative_offset_.size(),
REPORT_INNER_ERROR("E19999", "basic_offset_size:%zu not equal to relative_offset_size:%zu, "
"check invalid", zero_copy_basic_offset_.size(),
zero_copy_relative_offset_.size());
return PARAM_INVALID,
"[Check][Param] basic_offset_size:%zu should be equal to relative_offset_size:%zu",
zero_copy_basic_offset_.size(), zero_copy_relative_offset_.size());
GELOGD("[ZCPY] zero_copy_basic_offset size is %zu", zero_copy_basic_offset_.size()); GELOGD("[ZCPY] zero_copy_basic_offset size is %zu", zero_copy_basic_offset_.size());


int64_t virtual_addr_offset = op_desc->GetOutputOffset().at(kDataIndex); int64_t virtual_addr_offset = op_desc->GetOutputOffset().at(kDataIndex);
@@ -78,7 +83,8 @@ Status ZeroCopyOffset::InitOutputDataInfo(const vector<int64_t> &input_size_list
if (TensorUtils::GetTensorSizeInBytes(*tensor_desc, size) != GRAPH_SUCCESS) { if (TensorUtils::GetTensorSizeInBytes(*tensor_desc, size) != GRAPH_SUCCESS) {
REPORT_INNER_ERROR("E19999", "Get input TensorSize in op:%s(%s) failed, input_index:%zu", REPORT_INNER_ERROR("E19999", "Get input TensorSize in op:%s(%s) failed, input_index:%zu",
op_desc->GetName().c_str(), op_desc->GetType().c_str(), idx); op_desc->GetName().c_str(), op_desc->GetType().c_str(), idx);
GELOGE(FAILED, "GetTensorSizeInBytes failed!");
GELOGE(FAILED, "[Get][InputTensorSize] in op:%s(%s) failed, input_index:%zu",
op_desc->GetName().c_str(), op_desc->GetType().c_str(), idx);
return FAILED; return FAILED;
} }


@@ -88,8 +94,13 @@ Status ZeroCopyOffset::InitOutputDataInfo(const vector<int64_t> &input_size_list
op_name_ = op_desc->GetName(); op_name_ = op_desc->GetName();
(void)ge::AttrUtils::GetListInt(op_desc, ATTR_ZERO_COPY_BASIC_OFFSET, zero_copy_basic_offset_); (void)ge::AttrUtils::GetListInt(op_desc, ATTR_ZERO_COPY_BASIC_OFFSET, zero_copy_basic_offset_);
(void)ge::AttrUtils::GetListInt(op_desc, ATTR_ZERO_COPY_RELATIVE_OFFSET, zero_copy_relative_offset_); (void)ge::AttrUtils::GetListInt(op_desc, ATTR_ZERO_COPY_RELATIVE_OFFSET, zero_copy_relative_offset_);
GE_CHK_BOOL_EXEC(zero_copy_basic_offset_.size() == zero_copy_relative_offset_.size(), return PARAM_INVALID,
"basic_offset_size should be equal to relative_offset_size");
GE_CHK_BOOL_EXEC(zero_copy_basic_offset_.size() == zero_copy_relative_offset_.size(),
REPORT_INNER_ERROR("E19999", "basic_offset_size:%zu not equal to relative_offset_size:%zu, "
"check invalid",
zero_copy_basic_offset_.size(), zero_copy_relative_offset_.size());
return PARAM_INVALID,
"[Check][Param] basic_offset_size:%zu should be equal to relative_offset_size:%zu",
zero_copy_basic_offset_.size(), zero_copy_relative_offset_.size());
int64_t virtual_addr_offset = op_desc->GetInputOffset().at(idx); int64_t virtual_addr_offset = op_desc->GetInputOffset().at(idx);
IsL2Fusion(zero_copy_basic_offset_, virtual_addr_offset, fusion_flag); IsL2Fusion(zero_copy_basic_offset_, virtual_addr_offset, fusion_flag);


@@ -194,7 +205,8 @@ void ZeroCopyOffset::SetOutsideAddrsValue(ZeroCopyTask &zero_copy_task, void *ou
for (uint32_t out_count = 0; out_count < GetAddrCount(); ++out_count) { for (uint32_t out_count = 0; out_count < GetAddrCount(); ++out_count) {
auto args_addrs = outside_addrs_[out_count].find(outside_addr); auto args_addrs = outside_addrs_[out_count].find(outside_addr);
if (args_addrs != outside_addrs_[out_count].end()) { if (args_addrs != outside_addrs_[out_count].end()) {
GE_CHK_STATUS(zero_copy_task.SetTaskArgsOffset(addr_val, offset), "Input args invalid.");
GE_CHK_STATUS(zero_copy_task.SetTaskArgsOffset(addr_val, offset),
"[Set][TaskArgsOffset] failed, Input args invalid, offset:%zu.", offset);
void *args_val = static_cast<uint8_t *>(args) + offset; void *args_val = static_cast<uint8_t *>(args) + offset;
args_addrs->second.push_back(args_val); args_addrs->second.push_back(args_val);
GELOGD("[ZCPY] set copy input: virtual_addr: 0x%lx, task_addr: %p, args: %p, offset: %zu.", addr_val, args_val, GELOGD("[ZCPY] set copy input: virtual_addr: 0x%lx, task_addr: %p, args: %p, offset: %zu.", addr_val, args_val,


+ 5
- 6
ge/graph/load/model_manager/zero_copy_task.cc View File

@@ -36,9 +36,9 @@ ZeroCopyTask::~ZeroCopyTask() { args_addr_ = nullptr; }
*/ */
Status ZeroCopyTask::SetTaskArgsOffset(uintptr_t addr, size_t offset) { Status ZeroCopyTask::SetTaskArgsOffset(uintptr_t addr, size_t offset) {
if (offset + sizeof(uintptr_t) > args_size_) { if (offset + sizeof(uintptr_t) > args_size_) {
REPORT_INNER_ERROR("E19999", "Param offset:%zu + 8 > args_size_:%zu, check invalid",
offset, args_size_);
GELOGE(FAILED, "[ZCPY] %s set task args failed, args size: %zu, offset: %zu", name_.c_str(), args_size_, offset);
REPORT_INNER_ERROR("E19999", "Param offset:%zu + 8 > args_size_:%zu, check invalid", offset, args_size_);
GELOGE(FAILED, "[Check][Param] [ZCPY] %s set task args failed, args size:%zu, offset:%zu",
name_.c_str(), args_size_, offset);
return FAILED; // unexpected error, need fix. return FAILED; // unexpected error, need fix.
} }


@@ -118,9 +118,8 @@ Status ZeroCopyTask::DistributeParam(bool async_mode, rtStream_t stream) {
} }


if (rt_err != RT_ERROR_NONE) { if (rt_err != RT_ERROR_NONE) {
REPORT_CALL_ERROR("E19999", "Call rtMemcpyAsync or rtMemcpy failed, size:%zu, ret: 0x%X",
args_size_, rt_err);
GELOGE(RT_FAILED, "[ZCPY] %s distribute task param failed, error=0x%x", name_.c_str(), rt_err);
REPORT_CALL_ERROR("E19999", "Call rtMemcpyAsync or rtMemcpy failed, size:%zu, ret:0x%X", args_size_, rt_err);
GELOGE(RT_FAILED, "[Distribute][TaskParam] for %s failed, error = 0x%x", name_.c_str(), rt_err);
return RT_ERROR_TO_GE_STATUS(rt_err); return RT_ERROR_TO_GE_STATUS(rt_err);
} }




+ 16
- 23
ge/graph/manager/graph_caching_allocator.cc View File

@@ -112,7 +112,7 @@ Status CachingAllocator::Initialize(uint32_t device_id) {
auto bin_ptr = new (std::nothrow) BlockBin(BlockComparator); auto bin_ptr = new (std::nothrow) BlockBin(BlockComparator);
if (bin_ptr == nullptr) { if (bin_ptr == nullptr) {
REPORT_CALL_ERROR("E19999", "New BlockBin fail, device_id:%u", device_id); REPORT_CALL_ERROR("E19999", "New BlockBin fail, device_id:%u", device_id);
GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Alloc BlockBin failed.");
GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Alloc][BlockBin] failed, device_id:%u", device_id);
return ACL_ERROR_GE_MEMORY_ALLOCATION; return ACL_ERROR_GE_MEMORY_ALLOCATION;
} }
free_block_bins_[i] = bin_ptr; free_block_bins_[i] = bin_ptr;
@@ -147,9 +147,8 @@ uint8_t *CachingAllocator::Malloc(size_t size, uint8_t *org_ptr, uint32_t device
ptr = block->ptr; ptr = block->ptr;
} }
if (ptr == nullptr) { if (ptr == nullptr) {
REPORT_INNER_ERROR("E19999", "FindFreeBlock fail, size:%zu, device_id:%u",
size, device_id);
GELOGE(FAILED, "Malloc failed device id = %u, size= %zu", device_id, size);
REPORT_INNER_ERROR("E19999", "FindFreeBlock fail, size:%zu, device_id:%u", size, device_id);
GELOGE(FAILED, "[Check][Param] FindFreeBlock failed device id = %u, size= %zu", device_id, size);
} }
return ptr; return ptr;
} }
@@ -157,18 +156,16 @@ uint8_t *CachingAllocator::Malloc(size_t size, uint8_t *org_ptr, uint32_t device
Status CachingAllocator::Free(uint8_t *ptr, uint32_t device_id) { Status CachingAllocator::Free(uint8_t *ptr, uint32_t device_id) {
GELOGI("Free device id = %u", device_id); GELOGI("Free device id = %u", device_id);
if (ptr == nullptr) { if (ptr == nullptr) {
REPORT_INNER_ERROR("E19999", "Param ptr is nullptr, device_id:%u, check invalid",
device_id);
GELOGE(PARAM_INVALID, "Invalid memory pointer");
REPORT_INNER_ERROR("E19999", "Param ptr is nullptr, device_id:%u, check invalid", device_id);
GELOGE(PARAM_INVALID, "[Check][Param] Invalid memory pointer, device_id:%u", device_id);
return ge::PARAM_INVALID; return ge::PARAM_INVALID;
} }


std::lock_guard<std::recursive_mutex> lock(mutex_); std::lock_guard<std::recursive_mutex> lock(mutex_);
auto it = allocated_blocks_.find(ptr); auto it = allocated_blocks_.find(ptr);
if (it == allocated_blocks_.end()) { if (it == allocated_blocks_.end()) {
REPORT_INNER_ERROR("E19999", "Param ptr not allocated before, device_id:%u, check invalid",
device_id);
GELOGE(PARAM_INVALID, "Invalid memory pointer: %p", ptr);
REPORT_INNER_ERROR("E19999", "Param ptr not allocated before, device_id:%u, check invalid", device_id);
GELOGE(PARAM_INVALID, "[Check][Param] Param ptr not allocated before, device_id:%u", device_id);
return ge::PARAM_INVALID; return ge::PARAM_INVALID;
} }
Block *block = it->second; Block *block = it->second;
@@ -225,9 +222,8 @@ Block *CachingAllocator::FindFreeBlock(size_t size, uint8_t *org_ptr, uint32_t d
Block key(device_id, size, org_ptr); Block key(device_id, size, org_ptr);
BlockBin *bin = GetBlockBin(size); BlockBin *bin = GetBlockBin(size);
if (bin == nullptr) { if (bin == nullptr) {
REPORT_INNER_ERROR("E19999", "GetBlockBin fail, size:%zu, device_id:%u",
size, device_id);
GELOGE(ge::FAILED, "Get block bin failed size = %zu", size);
REPORT_INNER_ERROR("E19999", "GetBlockBin fail, size:%zu, device_id:%u", size, device_id);
GELOGE(ge::FAILED, "[Get][BlockBin] failed, size:%zu, device_id:%u", size, device_id);
return nullptr; return nullptr;
} }
std::lock_guard<std::recursive_mutex> lock(mutex_); std::lock_guard<std::recursive_mutex> lock(mutex_);
@@ -258,9 +254,8 @@ Block *CachingAllocator::SplitBlock(Block *block, size_t size, BlockBin &bin, ui
Block *remaining = block; Block *remaining = block;
Block *new_block = new (std::nothrow) Block(device_id, size, &bin, block->ptr); Block *new_block = new (std::nothrow) Block(device_id, size, &bin, block->ptr);
if (new_block == nullptr) { if (new_block == nullptr) {
REPORT_CALL_ERROR("E19999", "New Block fail, size:%zu, device_id:%u",
size, device_id);
GELOGE(ge::FAILED, "Alloc block failed size = %zu", size);
REPORT_CALL_ERROR("E19999", "New Block fail, size:%zu, device_id:%u", size, device_id);
GELOGE(ge::FAILED, "[Alloc][Block] failed, size:%zu, device_id:%u", size, device_id);
return block; return block;
} }
new_block->prev = remaining->prev; new_block->prev = remaining->prev;
@@ -285,7 +280,7 @@ Status CachingAllocator::TryExtendCache(size_t size, uint32_t device_id) {
size_t free_cached_memory_size = FreeCachedBlocks(); size_t free_cached_memory_size = FreeCachedBlocks();
memory_addr = memory_allocator_->MallocMemory(purpose, memory_size, device_id); memory_addr = memory_allocator_->MallocMemory(purpose, memory_size, device_id);
if (memory_addr == nullptr) { if (memory_addr == nullptr) {
GELOGE(ge::FAILED, "TryExtendCache failed, no enough memory for size = %zu, device_id = %u", memory_size,
GELOGE(ge::FAILED, "[Malloc][Memory] failed, no enough memory for size = %zu, device_id = %u", memory_size,
device_id); device_id);
return ge::FAILED; return ge::FAILED;
} }
@@ -304,16 +299,14 @@ Status CachingAllocator::TryExtendCache(size_t size, uint32_t device_id) {
Status CachingAllocator::AddToBlockBin(uint8_t *ptr, size_t size, uint32_t device_id) { Status CachingAllocator::AddToBlockBin(uint8_t *ptr, size_t size, uint32_t device_id) {
BlockBin *bin = GetBlockBin(size); BlockBin *bin = GetBlockBin(size);
if (bin == nullptr) { if (bin == nullptr) {
REPORT_INNER_ERROR("E19999", "GetBlockBin fail, size:%zu, device_id:%u",
size, device_id);
GELOGE(ge::FAILED, "Get block bin failed size = %zu", size);
REPORT_INNER_ERROR("E19999", "GetBlockBin fail, size:%zu, device_id:%u", size, device_id);
GELOGE(ge::FAILED, "[Get][BlockBin] failed, size:%zu, device_id:%u", size, device_id);
return ge::FAILED; return ge::FAILED;
} }
Block *block = new (std::nothrow) Block(device_id, size, bin, nullptr); Block *block = new (std::nothrow) Block(device_id, size, bin, nullptr);
if (block == nullptr) { if (block == nullptr) {
REPORT_CALL_ERROR("E19999", "New Block fail, size:%zu, device_id:%u",
size, device_id);
GELOGE(ge::FAILED, "Alloc block failed size = %zu", size);
REPORT_CALL_ERROR("E19999", "New Block fail, size:%zu, device_id:%u", size, device_id);
GELOGE(ge::FAILED, "[Alloc][Block] failed, size:%zu, device_id:%u", size, device_id);
return ge::FAILED; return ge::FAILED;
} }




+ 10
- 9
ge/graph/manager/graph_context.cc View File

@@ -33,7 +33,7 @@ GraphContext::GraphContext(const GraphNodePtr &graph_node) {
if (compute_graph_ == nullptr) { if (compute_graph_ == nullptr) {
std::shared_ptr<const ge::Graph> graph = graph_node->GetGraph(); std::shared_ptr<const ge::Graph> graph = graph_node->GetGraph();
if (graph == nullptr) { if (graph == nullptr) {
GELOGE(GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL, "compute_graph by graphNode is NULL!");
GELOGE(GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL, "[Get][Graph] failed, compute_graph by graphNode is NULL!");
return; return;
} }


@@ -45,7 +45,7 @@ GraphContext::GraphContext(const GraphNodePtr &graph_node) {
Status GraphContext::SetComputeGraph(const GraphNodePtr &graph_node) { Status GraphContext::SetComputeGraph(const GraphNodePtr &graph_node) {
if (graph_node == nullptr) { if (graph_node == nullptr) {
REPORT_INNER_ERROR("E19999", "Param graph_node is nullptr, check invalid"); REPORT_INNER_ERROR("E19999", "Param graph_node is nullptr, check invalid");
GELOGE(GE_GRAPH_PARAM_NULLPTR, "graphNode is NULL!");
GELOGE(GE_GRAPH_PARAM_NULLPTR, "[Check][Param] graphNode is NULL!");
return GE_GRAPH_PARAM_NULLPTR; return GE_GRAPH_PARAM_NULLPTR;
} }


@@ -56,7 +56,7 @@ Status GraphContext::SetComputeGraph(const GraphNodePtr &graph_node) {
std::shared_ptr<const ge::Graph> graph = graph_node->GetGraph(); std::shared_ptr<const ge::Graph> graph = graph_node->GetGraph();
if (graph == nullptr) { if (graph == nullptr) {
REPORT_INNER_ERROR("E19999", "Param graph in graph_node is nullptr, check invalid"); REPORT_INNER_ERROR("E19999", "Param graph in graph_node is nullptr, check invalid");
GELOGE(GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL, "compute_graph by graphNode is NULL!");
GELOGE(GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL, "[Get][Graph] failed, compute_graph by graphNode is NULL!");
return GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL; return GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL;
} }


@@ -73,14 +73,15 @@ Status GraphContext::Finalize() const { return SUCCESS; }
Status GraphContext::GetVariableTensor(const std::string &var_data_name, GeTensor &returned_tensor) { Status GraphContext::GetVariableTensor(const std::string &var_data_name, GeTensor &returned_tensor) {
if (var_data_name.empty()) { if (var_data_name.empty()) {
REPORT_INNER_ERROR("E19999", "Param var_data_name is empty, check invalid"); REPORT_INNER_ERROR("E19999", "Param var_data_name is empty, check invalid");
GELOGE(GE_GRAPH_EMPTY_STRING_NAME, "Variable data name is empty!");
GELOGE(GE_GRAPH_EMPTY_STRING_NAME, "[Check][Param] Variable data name is empty!");
return GE_GRAPH_EMPTY_STRING_NAME; return GE_GRAPH_EMPTY_STRING_NAME;
} }


if (GetVarNodeTensorTable().empty()) { if (GetVarNodeTensorTable().empty()) {
REPORT_INNER_ERROR("E19999", "VarNodeTensorTable is empty, var_data_name:%s, check invalid", REPORT_INNER_ERROR("E19999", "VarNodeTensorTable is empty, var_data_name:%s, check invalid",
var_data_name.c_str()); var_data_name.c_str());
GELOGE(GE_GRAPH_EMPTY_VARIABLE_TENSOR_TABLE, "VarNodeTensorTable is empty!");
GELOGE(GE_GRAPH_EMPTY_VARIABLE_TENSOR_TABLE, "[Check][Param] VarNodeTensorTable is empty, var_data_name:%s",
var_data_name.c_str());
return GE_GRAPH_EMPTY_VARIABLE_TENSOR_TABLE; return GE_GRAPH_EMPTY_VARIABLE_TENSOR_TABLE;
} }
for (auto &var_record : GetVarNodeTensorTable()) { for (auto &var_record : GetVarNodeTensorTable()) {
@@ -88,9 +89,8 @@ Status GraphContext::GetVariableTensor(const std::string &var_data_name, GeTenso
returned_tensor.SetTensorDesc(var_record.second.GetTensorDesc()); returned_tensor.SetTensorDesc(var_record.second.GetTensorDesc());
auto ret = returned_tensor.SetData(var_record.second.GetData()); auto ret = returned_tensor.SetData(var_record.second.GetData());
if (ret != SUCCESS) { if (ret != SUCCESS) {
REPORT_INNER_ERROR("E19999", "SetData to tensor fail, var_data_name:%s",
var_data_name.c_str());
GELOGE(ret, "Set Tensor data failed!");
REPORT_INNER_ERROR("E19999", "SetData to tensor fail, var_data_name:%s", var_data_name.c_str());
GELOGE(ret, "[Set][Data] to Tensor failed, var_data_name:%s", var_data_name.c_str());
return ret; return ret;
} }


@@ -100,7 +100,8 @@ Status GraphContext::GetVariableTensor(const std::string &var_data_name, GeTenso


REPORT_INNER_ERROR("E19999", "VarRecord with data_name:%s does not exist, check invalid", REPORT_INNER_ERROR("E19999", "VarRecord with data_name:%s does not exist, check invalid",
var_data_name.c_str()); var_data_name.c_str());
GELOGE(GE_GRAPH_VARIABLE_DOES_NOT_EXIST, "VarRecord with data_name %s does NOT exist!", var_data_name.c_str());
GELOGE(GE_GRAPH_VARIABLE_DOES_NOT_EXIST, "[Check][Param] VarRecord with data_name %s does NOT exist!",
var_data_name.c_str());


return GE_GRAPH_VARIABLE_DOES_NOT_EXIST; return GE_GRAPH_VARIABLE_DOES_NOT_EXIST;
} }


+ 267
- 263
ge/graph/manager/graph_manager.cc
File diff suppressed because it is too large
View File


+ 9
- 6
ge/graph/manager/graph_manager_utils.cc View File

@@ -46,7 +46,7 @@ GraphNode::GraphNode(GraphId graph_id)
sem_(1) { sem_(1) {
graph_run_async_listener_ = MakeShared<RunAsyncListener>(); graph_run_async_listener_ = MakeShared<RunAsyncListener>();
if (graph_run_async_listener_ == nullptr) { if (graph_run_async_listener_ == nullptr) {
GELOGE(MEMALLOC_FAILED, "Make shared failed");
GELOGE(MEMALLOC_FAILED, "[New][RunAsyncListener] failed");
} }
} }


@@ -82,7 +82,8 @@ SubGraphInfo::~SubGraphInfo() {
rt_ret = rtFreeHost(buffer_addr); rt_ret = rtFreeHost(buffer_addr);
buffer_addr = nullptr; buffer_addr = nullptr;
if (rt_ret != RT_ERROR_NONE) { if (rt_ret != RT_ERROR_NONE) {
GELOGE(rt_ret, "[GraphManager] subgraph free buffer failed, modelId = %u", model_id_info_.model_id);
GELOGE(rt_ret, "[Call][RtFreeHost] subgraph free buffer failed, modelId = %u",
model_id_info_.model_id);
} }
} }
} }
@@ -94,8 +95,8 @@ Status SubGraphInfo::FreeInOutBuffer() {
rtError_t rt_ret; rtError_t rt_ret;
rt_ret = rtFreeHost(*iter); rt_ret = rtFreeHost(*iter);
if (rt_ret != RT_ERROR_NONE) { if (rt_ret != RT_ERROR_NONE) {
REPORT_CALL_ERROR("E19999", "Call rtFreeHost fail");
GELOGE(rt_ret, "[GraphManager] subgraph free buffer failed, modelId = %u", model_id_info_.model_id);
REPORT_CALL_ERROR("E19999", "Call rtFreeHost fail, ret:%d", rt_ret);
GELOGE(rt_ret, "[Call][RtFreeHost] subgraph free buffer failed, modelId = %u", model_id_info_.model_id);
buffer_addr_.erase(buffer_addr_.begin(), iter); buffer_addr_.erase(buffer_addr_.begin(), iter);
return GE_GRAPH_FREE_FAILED; return GE_GRAPH_FREE_FAILED;
} }
@@ -131,7 +132,7 @@ Status GraphModelListener::OnComputeDone(uint32_t model_id, uint32_t task_id, ui
uint32_t GraphModelListener::GetResultCode() const { uint32_t GraphModelListener::GetResultCode() const {
if (!is_finished_) { if (!is_finished_) {
REPORT_CALL_ERROR("E19999", "Model not run finish"); REPORT_CALL_ERROR("E19999", "Model not run finish");
GELOGE(INTERNAL_ERROR, "[GraphManager] model not run finish.");
GELOGE(INTERNAL_ERROR, "[Check][Param] model not run finish.");
return INTERNAL_ERROR; return INTERNAL_ERROR;
} }
return result_code_; return result_code_;
@@ -170,7 +171,9 @@ bool HasCalcOp(const ComputeGraphPtr &graph) {


for (const auto &node : graph->GetAllNodes()) { for (const auto &node : graph->GetAllNodes()) {
OpDescPtr op_desc = node->GetOpDesc(); OpDescPtr op_desc = node->GetOpDesc();
GE_IF_BOOL_EXEC(op_desc == nullptr, GELOGE(FAILED, "Node GetOpDesc is nullptr"); return false);
GE_IF_BOOL_EXEC(op_desc == nullptr,
REPORT_INNER_ERROR("E19999", "GetOpDesc failed, Node GetOpDesc is nullptr");
GELOGE(FAILED, "[Get][OpDesc] failed, Node GetOpDesc is nullptr"); return false);
if (calc_op_type.find(op_desc->GetType()) != calc_op_type.end()) { if (calc_op_type.find(op_desc->GetType()) != calc_op_type.end()) {
return true; return true;
} }


+ 6
- 12
ge/graph/manager/graph_mem_allocator.cc View File

@@ -50,9 +50,7 @@ uint8_t *MemoryAllocator::MallocMemory(const string &purpose, size_t memory_size
if (rtMalloc(reinterpret_cast<void **>(&memory_addr), memory_size, memory_type_) != RT_ERROR_NONE) { if (rtMalloc(reinterpret_cast<void **>(&memory_addr), memory_size, memory_type_) != RT_ERROR_NONE) {
REPORT_CALL_ERROR("E19999", "Call rtMalloc fail, purpose:%s, size:%zu, device_id:%u", REPORT_CALL_ERROR("E19999", "Call rtMalloc fail, purpose:%s, size:%zu, device_id:%u",
purpose.c_str(), memory_size, device_id); purpose.c_str(), memory_size, device_id);
GELOGE(ge::INTERNAL_ERROR,
"MemoryAllocator::MallocMemory device_id = %u,"
" size= %lu",
GELOGE(ge::INTERNAL_ERROR, "[Malloc][Memory] failed, device_id = %u, size= %lu",
device_id, memory_size); device_id, memory_size);


return nullptr; return nullptr;
@@ -68,7 +66,7 @@ Status MemoryAllocator::FreeMemory(uint8_t *memory_addr, uint32_t device_id) con
auto rtRet = rtFree(memory_addr); auto rtRet = rtFree(memory_addr);
if (rtRet != RT_ERROR_NONE) { if (rtRet != RT_ERROR_NONE) {
REPORT_CALL_ERROR("E19999", "Call rtFree fail, device_id:%u", device_id); REPORT_CALL_ERROR("E19999", "Call rtFree fail, device_id:%u", device_id);
GELOGE(rtRet, "MemoryAllocator::MallocMemory device_id = %u", device_id);
GELOGE(rtRet, "[Call][RtFree] failed, device_id = %u", device_id);
return RT_ERROR_TO_GE_STATUS(rtRet); return RT_ERROR_TO_GE_STATUS(rtRet);
} }
memory_addr = nullptr; memory_addr = nullptr;
@@ -88,10 +86,8 @@ uint8_t *MemoryAllocator::MallocMemory(const string &purpose, const string &memo
if (memory_addr == nullptr) { if (memory_addr == nullptr) {
REPORT_CALL_ERROR("E19999", "Malloc Memory fail, purpose:%s, memory_key:%s, memory_size:%zu, device_id:%u", REPORT_CALL_ERROR("E19999", "Malloc Memory fail, purpose:%s, memory_key:%s, memory_size:%zu, device_id:%u",
purpose.c_str(), memory_key.c_str(), memory_size, device_id); purpose.c_str(), memory_key.c_str(), memory_size, device_id);
GELOGE(ge::INTERNAL_ERROR,
"MemoryAllocator::MallocMemory failed,"
" memory_key[%s], size = %lu.",
memory_key.c_str(), memory_size);
GELOGE(ge::INTERNAL_ERROR, "[Malloc][Memory] failed, memory_key[%s], size = %lu, device_id:%u.",
memory_key.c_str(), memory_size, device_id);
return nullptr; return nullptr;
} }


@@ -126,10 +122,8 @@ Status MemoryAllocator::FreeMemory(const string &memory_key, uint32_t device_id)
if (FreeMemory(it->second.memory_addr_, device_id) != ge::SUCCESS) { if (FreeMemory(it->second.memory_addr_, device_id) != ge::SUCCESS) {
REPORT_CALL_ERROR("E19999", "Free Memory fail, memory_key:%s, device_id:%u", REPORT_CALL_ERROR("E19999", "Free Memory fail, memory_key:%s, device_id:%u",
memory_key.c_str(), device_id); memory_key.c_str(), device_id);
GELOGE(ge::INTERNAL_ERROR,
"MemoryAllocator::FreeMemory rtFree failed,"
" memory_key[%s]",
memory_key.c_str());
GELOGE(ge::INTERNAL_ERROR, "[Free][Memory] failed, memory_key[%s], device_id:%u",
memory_key.c_str(), device_id);
return ge::INTERNAL_ERROR; return ge::INTERNAL_ERROR;
} }




+ 49
- 36
ge/graph/manager/graph_var_manager.cc View File

@@ -40,7 +40,8 @@ ge::Status VarResource::GetVarAddr(const std::string &var_name, const ge::GeTens
if (dev_ptr == nullptr) { if (dev_ptr == nullptr) {
REPORT_INNER_ERROR("E19999", "Param dev_ptr is nullptr, var_name:%s, session_id:%lu, " REPORT_INNER_ERROR("E19999", "Param dev_ptr is nullptr, var_name:%s, session_id:%lu, "
"check invalid", var_name.c_str(), session_id_); "check invalid", var_name.c_str(), session_id_);
GELOGE(FAILED, "[GetVarAddr] dev_ptr is null!");
GELOGE(FAILED, "[Check][Param] Param dev_ptr is nullptr, var_name:%s, session_id:%lu",
var_name.c_str(), session_id_);
return FAILED; return FAILED;
} }
std::string var_key = VarKey(var_name, tensor_desc); std::string var_key = VarKey(var_name, tensor_desc);
@@ -51,7 +52,8 @@ ge::Status VarResource::GetVarAddr(const std::string &var_name, const ge::GeTens
REPORT_INNER_ERROR("E19999", "var_key:%s can't find in var_addr_mgr_map_, var_name:%s, session_id:%lu, " REPORT_INNER_ERROR("E19999", "var_key:%s can't find in var_addr_mgr_map_, var_name:%s, session_id:%lu, "
"check invalid", var_key.c_str(), var_name.c_str(), "check invalid", var_key.c_str(), var_name.c_str(),
session_id_); session_id_);
GELOGE(FAILED, "VarResource::GetVarAddr failed, var_key %s", var_key.c_str());
GELOGE(FAILED, "[Check][Param] var_key:%s can't find in var_addr_mgr_map_, var_name:%s, session_id:%lu",
var_key.c_str(), var_name.c_str(), session_id_);
return FAILED; return FAILED;
} }


@@ -109,7 +111,8 @@ ge::Status VarResource::SaveVarAddr(const std::string &var_name, const ge::GeTen
REPORT_INNER_ERROR("E19999", "var_key:%s conflict in var_addr_mgr_map_, var_name:%s, session_id:%lu, " REPORT_INNER_ERROR("E19999", "var_key:%s conflict in var_addr_mgr_map_, var_name:%s, session_id:%lu, "
"check invalid", var_key.c_str(), var_name.c_str(), "check invalid", var_key.c_str(), var_name.c_str(),
session_id_); session_id_);
GELOGE(FAILED, "VarResource::SaveVarAddr, var_key %s save addr conflict", var_key.c_str());
GELOGE(FAILED, "[Check][Param] var_key:%s conflict in var_addr_mgr_map_, var_name:%s, session_id:%lu",
var_key.c_str(), var_name.c_str(), session_id_);
return FAILED; return FAILED;
} }


@@ -145,14 +148,15 @@ ge::Status VarResource::RenewCurVarDesc(const std::string &var_name, const ge::O
if (op_desc == nullptr) { if (op_desc == nullptr) {
REPORT_INNER_ERROR("E19999", "Param op_desc is nullptr, var_name:%s, session_id:%lu, check invalid", REPORT_INNER_ERROR("E19999", "Param op_desc is nullptr, var_name:%s, session_id:%lu, check invalid",
var_name.c_str(), session_id_); var_name.c_str(), session_id_);
GELOGE(FAILED, "[RenewCurVarDesc] renew var desc fail! input opdesc is null!");
GELOGE(FAILED, "[Check][Param] input opdesc is nullptr, var_name:%s, session_id:%lu",
var_name.c_str(), session_id_);
return FAILED; return FAILED;
} }


ge::GeTensorDesc curr_desc; ge::GeTensorDesc curr_desc;
ge::Status ret = GetCurVarDesc(var_name, curr_desc); ge::Status ret = GetCurVarDesc(var_name, curr_desc);
if (ret != SUCCESS) { if (ret != SUCCESS) {
GELOGE(FAILED, "[RenewCurVarDesc] Get var desc fail!");
GELOGE(FAILED, "[Get][CurVarDesc] fail, var_name:%s, session_id:%lu", var_name.c_str(), session_id_);
return FAILED; return FAILED;
} }
std::string key = VarKey(var_name, curr_desc); std::string key = VarKey(var_name, curr_desc);
@@ -164,7 +168,8 @@ ge::Status VarResource::RenewCurVarDesc(const std::string &var_name, const ge::O
REPORT_INNER_ERROR("E19999", "var_key:%s can't find in var_addr_mgr_map_, var_name:%s, session_id:%lu, op:%s(%s), " REPORT_INNER_ERROR("E19999", "var_key:%s can't find in var_addr_mgr_map_, var_name:%s, session_id:%lu, op:%s(%s), "
"check invalid", key.c_str(), var_name.c_str(), "check invalid", key.c_str(), var_name.c_str(),
session_id_, op_desc->GetName().c_str(), op_desc->GetType().c_str()); session_id_, op_desc->GetName().c_str(), op_desc->GetType().c_str());
GELOGE(FAILED, "[RenewCurVarDesc] can't find ele with key [%s]", key.c_str());
GELOGE(FAILED, "[Check][Param] var_key:%s can't find in var_addr_mgr_map_, var_name:%s, session_id:%lu, op:%s(%s)",
key.c_str(), var_name.c_str(), session_id_, op_desc->GetName().c_str(), op_desc->GetType().c_str());
return FAILED; return FAILED;
} }
auto val = iter->second; auto val = iter->second;
@@ -285,14 +290,15 @@ Status HbmMemResource::AssignVarMem(const std::string &var_name, uint64_t size,
if (total_size_ < var_mem_size_) { if (total_size_ < var_mem_size_) {
REPORT_INNER_ERROR("E19999", "VarMemMaxSize:%lu < var_mem_size_:%lu, var_size:%lu, var_name:%s, check invalid" REPORT_INNER_ERROR("E19999", "VarMemMaxSize:%lu < var_mem_size_:%lu, var_size:%lu, var_name:%s, check invalid"
"", total_size_, var_mem_size_, size, var_name.c_str()); "", total_size_, var_mem_size_, size, var_name.c_str());
GELOGE(PARAM_INVALID, "total_size_: %lu is smaller than var_mem_size_: %lu", total_size_, var_mem_size_);
GELOGE(PARAM_INVALID, "[Check][Param] total_size_:%lu is smaller than var_mem_size_:%lu, var_name:%s",
total_size_, var_mem_size_, var_name.c_str());
return PARAM_INVALID; return PARAM_INVALID;
} }
uint64_t free_size = total_size_ - var_mem_size_; uint64_t free_size = total_size_ - var_mem_size_;
if (free_size < (size + kSessionMemAlignSize * kSessionMemAlignUnit)) { if (free_size < (size + kSessionMemAlignSize * kSessionMemAlignUnit)) {
REPORT_INNER_ERROR("E19999", "free_size:%lu not enough, var_align_size:%lu, var_name:%s, check invalid", REPORT_INNER_ERROR("E19999", "free_size:%lu not enough, var_align_size:%lu, var_name:%s, check invalid",
free_size, size, var_name.c_str()); free_size, size, var_name.c_str());
GELOGE(PARAM_INVALID, "Out of memory : current var size[%lu] exceeds total var size[%lu]",
GELOGE(PARAM_INVALID, "[Check][Param] Out of memory: current var size[%lu] exceeds total var size[%lu]",
size + kSessionMemAlignSize * kSessionMemAlignUnit + var_mem_size_, total_size_); size + kSessionMemAlignSize * kSessionMemAlignUnit + var_mem_size_, total_size_);
return PARAM_INVALID; return PARAM_INVALID;
} }
@@ -317,7 +323,7 @@ Status RdmaMemResource::AssignVarMem(const std::string &var_name, uint64_t size,
if (buffer == nullptr) { if (buffer == nullptr) {
REPORT_CALL_ERROR("E19999", "malloc rdma memory fail, var_size:%lu, var_name:%s", REPORT_CALL_ERROR("E19999", "malloc rdma memory fail, var_size:%lu, var_name:%s",
size, var_name.c_str()); size, var_name.c_str());
GELOGE(MEMALLOC_FAILED, "Failed to malloc rdma memory for node %s, size = %lu", var_name.c_str(), size);
GELOGE(MEMALLOC_FAILED, "[Malloc][RdmaMemory] for node %s failed, size = %lu", var_name.c_str(), size);
return MEMALLOC_FAILED; return MEMALLOC_FAILED;
} }
address = static_cast<size_t>(reinterpret_cast<uintptr_t>(buffer)); address = static_cast<size_t>(reinterpret_cast<uintptr_t>(buffer));
@@ -468,7 +474,8 @@ int64_t VarManager::GetVarMemSize(rtMemType_t memory_type) {
if (mem_resource == nullptr) { if (mem_resource == nullptr) {
REPORT_INNER_ERROR("E19999", "Find no mem_resource in map, memory_type:%d, session_id:%lu", REPORT_INNER_ERROR("E19999", "Find no mem_resource in map, memory_type:%d, session_id:%lu",
memory_type, session_id_); memory_type, session_id_);
GELOGE(ge::INTERNAL_ERROR, "MemResource is invalid.");
GELOGE(ge::INTERNAL_ERROR, "[Check][Param] MemResource is invalid, memory_type:%d, session_id:%lu",
memory_type, session_id_);
return 0; return 0;
} }
return mem_resource->GetVarMemSize(); return mem_resource->GetVarMemSize();
@@ -483,7 +490,8 @@ Status VarManager::UpdateVarMemSize(rtMemType_t memory_type, int64_t mem_size) {
if (mem_resource == nullptr) { if (mem_resource == nullptr) {
REPORT_CALL_ERROR("E19999", "memory_type:%d invalid or New MemResource fail, session_id:%lu", REPORT_CALL_ERROR("E19999", "memory_type:%d invalid or New MemResource fail, session_id:%lu",
memory_type, session_id_); memory_type, session_id_);
GELOGE(ge::INTERNAL_ERROR, "Alloc MemResource failed, memory_type = %u.", memory_type);
GELOGE(ge::INTERNAL_ERROR, "[Alloc][MemResource] failed, memory_type:%u, session_id:%lu",
memory_type, session_id_);
return ge::INTERNAL_ERROR; return ge::INTERNAL_ERROR;
} else { } else {
mem_resource_map_[memory_type] = mem_resource; mem_resource_map_[memory_type] = mem_resource;
@@ -495,7 +503,8 @@ Status VarManager::UpdateVarMemSize(rtMemType_t memory_type, int64_t mem_size) {
if (mem_resource == nullptr) { if (mem_resource == nullptr) {
REPORT_INNER_ERROR("E19999", "MemResource is invalid, memory_type:%d, session_id:%lu", REPORT_INNER_ERROR("E19999", "MemResource is invalid, memory_type:%d, session_id:%lu",
memory_type, session_id_); memory_type, session_id_);
GELOGE(ge::INTERNAL_ERROR, "MemResource is invalid.");
GELOGE(ge::INTERNAL_ERROR, "[Check][Param] MemResource is invalid, memory_type:%u, session_id:%lu",
memory_type, session_id_);
return FAILED; return FAILED;
} }
mem_resource->UpdateVarMemSize(mem_size); mem_resource->UpdateVarMemSize(mem_size);
@@ -515,7 +524,8 @@ ge::Status VarManager::AssignVarMem(const std::string &var_name, const ge::GeTen
if (result != ge::SUCCESS) { if (result != ge::SUCCESS) {
REPORT_CALL_ERROR("E19999", "Get size from tensor fail, var_name:%s, memory_type:%d, session_id:%lu", REPORT_CALL_ERROR("E19999", "Get size from tensor fail, var_name:%s, memory_type:%d, session_id:%lu",
var_name.c_str(), memory_type, session_id_); var_name.c_str(), memory_type, session_id_);
GELOGE(result, "get size from TensorDesc failed");
GELOGE(result, "[Get][Size] from tensor fail, var_name:%s, memory_type:%u, session_id:%lu",
var_name.c_str(), memory_type, session_id_);
return result; return result;
} }


@@ -526,7 +536,8 @@ ge::Status VarManager::AssignVarMem(const std::string &var_name, const ge::GeTen
if (mem_resource == nullptr) { if (mem_resource == nullptr) {
REPORT_CALL_ERROR("E19999", "memory_type:%d invalid or New MemResource fail, session_id:%lu", REPORT_CALL_ERROR("E19999", "memory_type:%d invalid or New MemResource fail, session_id:%lu",
memory_type, session_id_); memory_type, session_id_);
GELOGE(ge::INTERNAL_ERROR, "Alloc MemResource failed, memory_type = %u.", memory_type);
GELOGE(ge::INTERNAL_ERROR, "[Alloc][MemResource] failed, memory_type:%u, session_id:%lu.",
memory_type, session_id_);
return ge::INTERNAL_ERROR; return ge::INTERNAL_ERROR;
} else { } else {
mem_resource_map_[memory_type] = mem_resource; mem_resource_map_[memory_type] = mem_resource;
@@ -538,7 +549,8 @@ ge::Status VarManager::AssignVarMem(const std::string &var_name, const ge::GeTen
if (mem_resource == nullptr) { if (mem_resource == nullptr) {
REPORT_INNER_ERROR("E19999", "MemResource is invalid, memory_type:%d, session_id:%lu", REPORT_INNER_ERROR("E19999", "MemResource is invalid, memory_type:%d, session_id:%lu",
memory_type, session_id_); memory_type, session_id_);
GELOGE(ge::INTERNAL_ERROR, "MemResource is invalid, memory_type = %u.", memory_type);
GELOGE(ge::INTERNAL_ERROR, "[Check][Param] MemResource is invalid, memory_type:%u, session_id:%lu.",
memory_type, session_id_);
return ge::INTERNAL_ERROR; return ge::INTERNAL_ERROR;
} }


@@ -567,14 +579,15 @@ ge::Status VarManager::AssignVarMem(const std::string &var_name, const ge::GeTen
if (can_not_reuse_old_memory) { if (can_not_reuse_old_memory) {
result = mem_resource->AssignVarMem(var_name, tensor_desc_size, session_id_, mem_offset); result = mem_resource->AssignVarMem(var_name, tensor_desc_size, session_id_, mem_offset);
if (result != SUCCESS) { if (result != SUCCESS) {
GELOGE(ge::INTERNAL_ERROR, "AssignVarMem by offset failed.");
GELOGE(ge::INTERNAL_ERROR, "[Assign][VarMem] by offset failed, session_id:%lu.", session_id_);
return ge::INTERNAL_ERROR; return ge::INTERNAL_ERROR;
} }


result = var_resource_->SaveVarAddr( result = var_resource_->SaveVarAddr(
var_name, tensor_desc, reinterpret_cast<uint8_t *>(static_cast<uintptr_t>(mem_offset)), memory_type); var_name, tensor_desc, reinterpret_cast<uint8_t *>(static_cast<uintptr_t>(mem_offset)), memory_type);
if (result != SUCCESS) { if (result != SUCCESS) {
GELOGE(ge::INTERNAL_ERROR, "AssignVarMem by offset failed.");
GELOGE(ge::INTERNAL_ERROR, "[Save][VarAddr] by offset failed, memory type:%u, session_id:%lu.",
memory_type, session_id_);
return ge::INTERNAL_ERROR; return ge::INTERNAL_ERROR;
} }
} }
@@ -681,7 +694,8 @@ ge::Status VarManager::RenewCurVarDesc(const std::string &var_name, ge::OpDescPt
REPORT_INNER_ERROR("E19999", "VarManager has not been init, op:%s(%s), session_id:%lu, check invalid", REPORT_INNER_ERROR("E19999", "VarManager has not been init, op:%s(%s), session_id:%lu, check invalid",
op_desc->GetName().c_str(), op_desc->GetType().c_str(), op_desc->GetName().c_str(), op_desc->GetType().c_str(),
session_id_); session_id_);
GELOGE(ge::INTERNAL_ERROR, "VarManager has not been init.");
GELOGE(ge::INTERNAL_ERROR, "[Check][Param] VarManager has not been init, op:%s(%s), session_id:%lu",
op_desc->GetName().c_str(), op_desc->GetType().c_str(), session_id_);
return ge::INTERNAL_ERROR; return ge::INTERNAL_ERROR;
} }
return var_resource_->RenewCurVarDesc(var_name, std::move(op_desc)); return var_resource_->RenewCurVarDesc(var_name, std::move(op_desc));
@@ -729,10 +743,8 @@ ge::Status VarManager::MallocVarMemory(size_t memory_size) {
const string purpose("variables and constant op memory in training network."); const string purpose("variables and constant op memory in training network.");
var_mem_base = MemManager::Instance().MemInstance(RT_MEMORY_HBM).MallocMemory(purpose, memory_key, var_memory_size); var_mem_base = MemManager::Instance().MemInstance(RT_MEMORY_HBM).MallocMemory(purpose, memory_key, var_memory_size);
if (var_mem_base == nullptr) { if (var_mem_base == nullptr) {
GELOGE(ge::INTERNAL_ERROR,
"VarManager::MallocVarMemory failed "
"session_id = %s",
memory_key.c_str());
GELOGE(ge::INTERNAL_ERROR, "[Malloc][VarMemory] failed, size:%zu, session_id:%s",
var_memory_size, memory_key.c_str());
return ge::INTERNAL_ERROR; return ge::INTERNAL_ERROR;
} }
return SUCCESS; return SUCCESS;
@@ -812,7 +824,7 @@ Status VarManager::SetMemoryMallocSize(const map<string, string> &options) {
string graph_memory_manager_malloc_max_size = it->second; string graph_memory_manager_malloc_max_size = it->second;
ge::Status ret = ParseMemoryMallocSize(graph_memory_manager_malloc_max_size, graph_mem_max_size_); ge::Status ret = ParseMemoryMallocSize(graph_memory_manager_malloc_max_size, graph_mem_max_size_);
if (ret != SUCCESS) { if (ret != SUCCESS) {
GELOGE(ge::GE_GRAPH_OPTIONS_INVALID, "Parse graph memory manager malloc max size failed.");
GELOGE(ge::GE_GRAPH_OPTIONS_INVALID, "[Call][ParseMemoryMallocSize] failed, session id:%lu.", session_id_);
return ge::GE_GRAPH_OPTIONS_INVALID; return ge::GE_GRAPH_OPTIONS_INVALID;
} }
GELOGI("The max size for graph mem is set to %zu", graph_mem_max_size_); GELOGI("The max size for graph mem is set to %zu", graph_mem_max_size_);
@@ -825,7 +837,7 @@ Status VarManager::SetMemoryMallocSize(const map<string, string> &options) {
string memory_var_manager_malloc_size = it->second; string memory_var_manager_malloc_size = it->second;
ge::Status ret = ParseMemoryMallocSize(memory_var_manager_malloc_size, var_mem_max_size_); ge::Status ret = ParseMemoryMallocSize(memory_var_manager_malloc_size, var_mem_max_size_);
if (ret != SUCCESS) { if (ret != SUCCESS) {
GELOGE(ge::GE_GRAPH_OPTIONS_INVALID, "Parse memory var manager malloc size failed.");
GELOGE(ge::GE_GRAPH_OPTIONS_INVALID, "[Call][ParseMemoryMallocSize] failed, session id:%lu.", session_id_);
return ge::GE_GRAPH_OPTIONS_INVALID; return ge::GE_GRAPH_OPTIONS_INVALID;
} }
} }
@@ -834,8 +846,8 @@ Status VarManager::SetMemoryMallocSize(const map<string, string> &options) {
if (var_mem_logic_base_ > kMaxMemorySize) { if (var_mem_logic_base_ > kMaxMemorySize) {
REPORT_INNER_ERROR("E19999", "var_login_base:%zu can not exeed limit:%zu, session_id:%lu, check invalid", REPORT_INNER_ERROR("E19999", "var_login_base:%zu can not exeed limit:%zu, session_id:%lu, check invalid",
var_mem_logic_base_, kMaxMemorySize, session_id_); var_mem_logic_base_, kMaxMemorySize, session_id_);
GELOGE(ge::GE_GRAPH_OPTIONS_INVALID, "kMemoryVarLogicBase : %zu can not exceed max memory size : %zu.",
var_mem_logic_base_, kMaxMemorySize);
GELOGE(ge::GE_GRAPH_OPTIONS_INVALID, "[Check][Param] kMemoryVarLogicBase:%zu can not exceed "
"max memory size:%zu, session_id:%lu.", var_mem_logic_base_, kMaxMemorySize, session_id_);
return ge::GE_GRAPH_OPTIONS_INVALID; return ge::GE_GRAPH_OPTIONS_INVALID;
} }


@@ -843,8 +855,8 @@ Status VarManager::SetMemoryMallocSize(const map<string, string> &options) {
if (use_max_mem_size_ > kMaxMemorySize) { if (use_max_mem_size_ > kMaxMemorySize) {
REPORT_INNER_ERROR("E19999", "all mem_use size:%zu can not exeed limit:%zu, session_id:%lu, check invalid", REPORT_INNER_ERROR("E19999", "all mem_use size:%zu can not exeed limit:%zu, session_id:%lu, check invalid",
use_max_mem_size_, kMaxMemorySize, session_id_); use_max_mem_size_, kMaxMemorySize, session_id_);
GELOGE(ge::GE_GRAPH_OPTIONS_INVALID, "kUseMaxMemorySize : %zu can not exceed max memory size : %zu.",
use_max_mem_size_, kMaxMemorySize);
GELOGE(ge::GE_GRAPH_OPTIONS_INVALID, "[Check][Param] kUseMaxMemorySize:%zu can not exceed "
"max memory size:%zu, session_id:%lu.", use_max_mem_size_, kMaxMemorySize, session_id_);
return ge::GE_GRAPH_OPTIONS_INVALID; return ge::GE_GRAPH_OPTIONS_INVALID;
} }
GELOGI("Set memory malloc size successfully"); GELOGI("Set memory malloc size successfully");
@@ -855,7 +867,7 @@ Status VarManager::ParseMemoryMallocSize(string &memory_size, size_t &result) {
if (memory_size.empty()) { if (memory_size.empty()) {
REPORT_INNER_ERROR("E19999", "Param memory_size is empty, session_id:%lu, check invalid", REPORT_INNER_ERROR("E19999", "Param memory_size is empty, session_id:%lu, check invalid",
session_id_); session_id_);
GELOGE(GE_GRAPH_OPTIONS_INVALID, "Memory malloc size input is empty.");
GELOGE(GE_GRAPH_OPTIONS_INVALID, "[Check][Param] Memory malloc size input is empty, session_id:%lu.", session_id_);
return GE_GRAPH_OPTIONS_INVALID; return GE_GRAPH_OPTIONS_INVALID;
} }
// split string by '*' // split string by '*'
@@ -882,7 +894,9 @@ Status VarManager::ParseMemoryMallocSize(string &memory_size, size_t &result) {
if (!isdigit(c)) { if (!isdigit(c)) {
REPORT_INNER_ERROR("E19999", "Param memory_size:%s contains non digit, session_id:%lu, check invalid", REPORT_INNER_ERROR("E19999", "Param memory_size:%s contains non digit, session_id:%lu, check invalid",
memory_size.c_str(), session_id_); memory_size.c_str(), session_id_);
GELOGE(GE_GRAPH_OPTIONS_INVALID, "Memory malloc size input contains non digit.");
GELOGE(GE_GRAPH_OPTIONS_INVALID,
"[Check][Param] Memory malloc size:%s input contains non digit, session_id:%lu.",
memory_size.c_str(), session_id_);
return GE_GRAPH_OPTIONS_INVALID; return GE_GRAPH_OPTIONS_INVALID;
} }
} }
@@ -891,13 +905,15 @@ Status VarManager::ParseMemoryMallocSize(string &memory_size, size_t &result) {
REPORT_INNER_ERROR("E19999", "Param memory_size:%s will overflow after multi all, session_id:%lu, " REPORT_INNER_ERROR("E19999", "Param memory_size:%s will overflow after multi all, session_id:%lu, "
"check invalid", memory_size.c_str(), "check invalid", memory_size.c_str(),
session_id_); session_id_);
GELOGE(FAILED, "Input memory size is out of range.");
GELOGE(FAILED, "[Check][Param] Param memory_size:%s will overflow after multi all, session_id:%lu",
memory_size.c_str(), session_id_);
return FAILED); return FAILED);
if ((num > kMaxMemorySize) || (result * static_cast<size_t>(num) > kMaxMemorySize)) { if ((num > kMaxMemorySize) || (result * static_cast<size_t>(num) > kMaxMemorySize)) {
REPORT_INNER_ERROR("E19999", "Param memory_size:%s after multi will exceed limit:%lu, session_id:%lu, " REPORT_INNER_ERROR("E19999", "Param memory_size:%s after multi will exceed limit:%lu, session_id:%lu, "
"check invalid", memory_size.c_str(), kMaxMemorySize, "check invalid", memory_size.c_str(), kMaxMemorySize,
session_id_); session_id_);
GELOGE(FAILED, "Input memory size can not exceed max memory size : %zu.", kMaxMemorySize);
GELOGE(FAILED, "[Check][Param] Input memory size can not exceed max memory size:%zu, session_id:%lu.",
kMaxMemorySize, session_id_);
return FAILED; return FAILED;
} }
result *= static_cast<size_t>(num); result *= static_cast<size_t>(num);
@@ -1001,10 +1017,7 @@ VarManager *VarManagerPool::GetVarManager(uint64_t session_id) {
VarManager *var_manager = new (std::nothrow) VarManager(session_id); VarManager *var_manager = new (std::nothrow) VarManager(session_id);
if (var_manager == nullptr) { if (var_manager == nullptr) {
REPORT_INNER_ERROR("E19999", "New VarManager fail, session_id:%lu", session_id); REPORT_INNER_ERROR("E19999", "New VarManager fail, session_id:%lu", session_id);
GELOGE(INTERNAL_ERROR,
"VarManager::Instance find session by "
"session_id[%lu] failed.",
session_id);
GELOGE(INTERNAL_ERROR, "[New][VarManager] fail, session_id:%lu", session_id);
static VarManager new_var_manager(0); static VarManager new_var_manager(0);
return &new_var_manager; return &new_var_manager;
} }


+ 4
- 4
ge/graph/manager/host_mem_allocator.cc View File

@@ -34,8 +34,8 @@ uint8_t *HostMemAllocator::Malloc(size_t size) {
std::lock_guard<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
std::shared_ptr<AlignedPtr> aligned_ptr = MakeShared<AlignedPtr>(size); std::shared_ptr<AlignedPtr> aligned_ptr = MakeShared<AlignedPtr>(size);
if (aligned_ptr == nullptr) { if (aligned_ptr == nullptr) {
REPORT_INNER_ERROR("E19999", "New AlignedPtr fail");
GELOGE(INTERNAL_ERROR, "make shared_ptr for AlignedPtr failed");
REPORT_INNER_ERROR("E19999", "New AlignedPtr fail, size:%zu", size);
GELOGE(INTERNAL_ERROR, "[Call][MakeShared] for AlignedPtr failed, size:%zu", size);
return nullptr; return nullptr;
} }
allocated_blocks_[aligned_ptr->Get()] = { size, aligned_ptr }; allocated_blocks_[aligned_ptr->Get()] = { size, aligned_ptr };
@@ -46,7 +46,7 @@ uint8_t *HostMemAllocator::Malloc(size_t size) {
Status HostMemAllocator::Free(const void *memory_addr) { Status HostMemAllocator::Free(const void *memory_addr) {
if (memory_addr == nullptr) { if (memory_addr == nullptr) {
REPORT_INNER_ERROR("E19999", "Param memory_addr is nullptr, check invalid"); REPORT_INNER_ERROR("E19999", "Param memory_addr is nullptr, check invalid");
GELOGE(GE_GRAPH_FREE_FAILED, "Invalid memory pointer");
GELOGE(GE_GRAPH_FREE_FAILED, "[Check][Param] Invalid memory pointer");
return GE_GRAPH_FREE_FAILED; return GE_GRAPH_FREE_FAILED;
} }


@@ -54,7 +54,7 @@ Status HostMemAllocator::Free(const void *memory_addr) {
auto it = allocated_blocks_.find(memory_addr); auto it = allocated_blocks_.find(memory_addr);
if (it == allocated_blocks_.end()) { if (it == allocated_blocks_.end()) {
REPORT_INNER_ERROR("E19999", "Memory_addr is not alloc before, check invalid"); REPORT_INNER_ERROR("E19999", "Memory_addr is not alloc before, check invalid");
GELOGE(PARAM_INVALID, "Invalid memory pointer");
GELOGE(PARAM_INVALID, "[Check][Param] Invalid memory pointer:%p", memory_addr);
return PARAM_INVALID; return PARAM_INVALID;
} }
it->second.second.reset(); it->second.second.reset();


+ 9
- 13
ge/graph/manager/host_mem_manager.cc View File

@@ -39,9 +39,8 @@ Status SharedMemAllocator::Allocate(SharedMemInfo &mem_info) {
rtMallocHostSharedMemoryOut output_para; rtMallocHostSharedMemoryOut output_para;
rtError_t rt_ret = rtMallocHostSharedMemory(&input_para, &output_para); rtError_t rt_ret = rtMallocHostSharedMemory(&input_para, &output_para);
if (rt_ret != RT_ERROR_NONE) { if (rt_ret != RT_ERROR_NONE) {
REPORT_CALL_ERROR("E19999", "Call rtMallocHostSharedMemory fail, ret:0x%X",
rt_ret);
GELOGE(RT_FAILED, "Call rt api(rtMallocHostSharedMemory) failed, devid:[%u].", device_id);
REPORT_CALL_ERROR("E19999", "Call rtMallocHostSharedMemory fail, ret:0x%X", rt_ret);
GELOGE(RT_FAILED, "[Call][RtMallocHostSharedMemory] failed, devid:[%u].", device_id);
return GE_GRAPH_MEMORY_ALLOC_FAILED; return GE_GRAPH_MEMORY_ALLOC_FAILED;
} }
mem_info.fd = output_para.fd; mem_info.fd = output_para.fd;
@@ -60,9 +59,8 @@ Status SharedMemAllocator::DeAllocate(SharedMemInfo &mem_info) {
mem_info.host_aligned_ptr->MutableGet(), mem_info.device_address}; mem_info.host_aligned_ptr->MutableGet(), mem_info.device_address};
rtError_t rt_ret = rtFreeHostSharedMemory(&free_para); rtError_t rt_ret = rtFreeHostSharedMemory(&free_para);
if (rt_ret != RT_ERROR_NONE) { if (rt_ret != RT_ERROR_NONE) {
REPORT_CALL_ERROR("E19999", "Call rtFreeHostSharedMemory fail, ret:0x%X",
rt_ret);
GELOGE(RT_FAILED, "Call rt api(rtFreeHostSharedMemory) failed, ret: 0x%X.", rt_ret);
REPORT_CALL_ERROR("E19999", "Call rtFreeHostSharedMemory fail, ret:0x%X", rt_ret);
GELOGE(RT_FAILED, "[Call][RtFreeHostSharedMemory] failed, ret:0x%X.", rt_ret);
return RT_FAILED; return RT_FAILED;
} }
return ge::SUCCESS; return ge::SUCCESS;
@@ -78,7 +76,7 @@ Status HostMemManager::Initialize() {
allocator_ = std::unique_ptr<SharedMemAllocator>(new (std::nothrow) SharedMemAllocator()); allocator_ = std::unique_ptr<SharedMemAllocator>(new (std::nothrow) SharedMemAllocator());
if (allocator_ == nullptr) { if (allocator_ == nullptr) {
REPORT_CALL_ERROR("E19999", "New SharedMemAllocator fail"); REPORT_CALL_ERROR("E19999", "New SharedMemAllocator fail");
GELOGE(GE_GRAPH_MALLOC_FAILED, "Shared memory allocator init failed!");
GELOGE(GE_GRAPH_MALLOC_FAILED, "[New][SharedMemAllocator] failed!");
return GE_GRAPH_MALLOC_FAILED; return GE_GRAPH_MALLOC_FAILED;
} }
return SUCCESS; return SUCCESS;
@@ -98,9 +96,8 @@ Status HostMemManager::MallocSharedMemory(SharedMemInfo &mem_info) {
std::lock_guard<std::recursive_mutex> lock(mutex_); std::lock_guard<std::recursive_mutex> lock(mutex_);
auto iter = var_memory_base_map_.find(mem_info.op_name); auto iter = var_memory_base_map_.find(mem_info.op_name);
if (iter != var_memory_base_map_.end()) { if (iter != var_memory_base_map_.end()) {
REPORT_INNER_ERROR("E19999", "MemInfo.op_name:%s can't find in var_memory_base_map_",
mem_info.op_name.c_str());
GELOGE(FAILED, "Host shared memory for op %s has been malloced", mem_info.op_name.c_str());
REPORT_INNER_ERROR("E19999", "Host shared memory for op %s has been malloced", mem_info.op_name.c_str());
GELOGE(FAILED, "[Check][Param] Host shared memory for op %s has been malloced", mem_info.op_name.c_str());
return FAILED; return FAILED;
} }
mem_info.shm_name = OpNameToShmName(mem_info.op_name); mem_info.shm_name = OpNameToShmName(mem_info.op_name);
@@ -113,9 +110,8 @@ Status HostMemManager::MallocSharedMemory(SharedMemInfo &mem_info) {
Status HostMemManager::QueryVarMemInfo(const string &op_name, uint64_t &base_addr, uint64_t &data_size) { Status HostMemManager::QueryVarMemInfo(const string &op_name, uint64_t &base_addr, uint64_t &data_size) {
std::lock_guard<std::recursive_mutex> lock(mutex_); std::lock_guard<std::recursive_mutex> lock(mutex_);
if (var_memory_base_map_.find(op_name) == var_memory_base_map_.end()) { if (var_memory_base_map_.find(op_name) == var_memory_base_map_.end()) {
REPORT_INNER_ERROR("E19999", "MemInfo.op_name:%s can't find in var_memory_base_map_",
op_name.c_str());
GELOGE(INTERNAL_ERROR, "Find host base base_addr failed,node name:%s!", op_name.c_str());
REPORT_INNER_ERROR("E19999", "MemInfo.op_name:%s can't find in var_memory_base_map_", op_name.c_str());
GELOGE(INTERNAL_ERROR, "[Check][Param] Find host base base_addr failed, node name:%s!", op_name.c_str());
return INTERNAL_ERROR; return INTERNAL_ERROR;
} }
base_addr = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(var_memory_base_map_[op_name].device_address)); base_addr = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(var_memory_base_map_[op_name].device_address));


+ 8
- 10
ge/graph/manager/memory_api.cc View File

@@ -50,9 +50,8 @@ Status RdmaRemoteRegister(const std::vector<HostVarInfo> &var_info, rtMemType_t
path.append(file_name); path.append(file_name);
string canonical_path = RealPath(path.c_str()); string canonical_path = RealPath(path.c_str());
if (canonical_path.empty()) { if (canonical_path.empty()) {
REPORT_INNER_ERROR("E19999", "canonical_path:%s is empty, check invalid",
canonical_path.c_str());
GELOGE(FAILED, "Failed to get realpath of %s", path.c_str());
REPORT_INNER_ERROR("E19999", "canonical_path:%s is empty, check invalid", canonical_path.c_str());
GELOGE(FAILED, "[Call][RealPath] Failed to get realpath of %s", path.c_str());
return FAILED; return FAILED;
} }
GELOGI("FileName:%s, Path:%s.", file_name.c_str(), canonical_path.c_str()); GELOGI("FileName:%s, Path:%s.", file_name.c_str(), canonical_path.c_str());
@@ -69,15 +68,14 @@ Status RdmaRemoteRegister(const std::vector<HostVarInfo> &var_info, rtMemType_t
if (hcom_remote_mem_register == nullptr) { if (hcom_remote_mem_register == nullptr) {
REPORT_CALL_ERROR("E19999", "Symbol HcomRegRemoteAccessMem can't find in %s, check invalid", REPORT_CALL_ERROR("E19999", "Symbol HcomRegRemoteAccessMem can't find in %s, check invalid",
canonical_path.c_str()); canonical_path.c_str());
GELOGE(FAILED, "Failed to invoke hcom_remote_mem_register function.");
GELOGE(FAILED, "[Check][Param] Symbol HcomRegRemoteAccessMem can't find in %s", canonical_path.c_str());
return FAILED; return FAILED;
} }


HcclResult hccl_ret = hcom_remote_mem_register(reg_addrs.get(), table_len); HcclResult hccl_ret = hcom_remote_mem_register(reg_addrs.get(), table_len);
if (hccl_ret != HCCL_SUCCESS) { if (hccl_ret != HCCL_SUCCESS) {
REPORT_CALL_ERROR("E19999", "Call hcom_remote_mem_register failed, ret:%d,",
hccl_ret);
GELOGE(HCCL_E_INTERNAL, "Rdma mem register failed, ret: 0x%X", hccl_ret);
REPORT_CALL_ERROR("E19999", "Call hcom_remote_mem_register failed, ret:%d,", hccl_ret);
GELOGE(HCCL_E_INTERNAL, "[Call][HcomRemoteMemRegister] Rdma mem register failed, ret:0x%X", hccl_ret);
return HCCL_E_INTERNAL; return HCCL_E_INTERNAL;
} }
return SUCCESS; return SUCCESS;
@@ -88,14 +86,14 @@ Status MallocSharedMemory(const TensorInfo &tensor_info, uint64_t &dev_addr, uin
uint32_t type_size = 0; uint32_t type_size = 0;
bool result = TypeUtils::GetDataTypeLength(tensor_info.data_type, type_size); bool result = TypeUtils::GetDataTypeLength(tensor_info.data_type, type_size);
if (!result) { if (!result) {
GELOGE(GRAPH_FAILED, "GetDataTypeLength failed, data_type=(%s).",
GELOGE(GRAPH_FAILED, "[Get][DataTypeLength] failed, data_type=(%s).",
TypeUtils::DataTypeToSerialString(tensor_info.data_type).c_str()); TypeUtils::DataTypeToSerialString(tensor_info.data_type).c_str());
return GRAPH_FAILED; return GRAPH_FAILED;
} }
memory_size = type_size; memory_size = type_size;
for (auto dim : tensor_info.dims) { for (auto dim : tensor_info.dims) {
if (dim <= 0) { if (dim <= 0) {
GELOGE(GRAPH_FAILED, "Tensor dims should be positive");
GELOGE(GRAPH_FAILED, "[Check][Param] Tensor dims should be positive");
return GRAPH_FAILED; return GRAPH_FAILED;
} }
memory_size *= dim; memory_size *= dim;
@@ -103,7 +101,7 @@ Status MallocSharedMemory(const TensorInfo &tensor_info, uint64_t &dev_addr, uin
SharedMemInfo mem_info(tensor_info.var_name, memory_size); SharedMemInfo mem_info(tensor_info.var_name, memory_size);
Status ret = HostMemManager::Instance().MallocSharedMemory(mem_info); Status ret = HostMemManager::Instance().MallocSharedMemory(mem_info);
if (ret != SUCCESS) { if (ret != SUCCESS) {
GELOGE(GRAPH_FAILED, "MallocSharedMemory failed op name [%s]", tensor_info.var_name.c_str());
GELOGE(GRAPH_FAILED, "[Malloc][SharedMemory] failed, op name [%s]", tensor_info.var_name.c_str());
return GRAPH_FAILED; return GRAPH_FAILED;
} }
dev_addr = reinterpret_cast<uint64_t>(reinterpret_cast<uintptr_t>(mem_info.device_address)); dev_addr = reinterpret_cast<uint64_t>(reinterpret_cast<uintptr_t>(mem_info.device_address));


+ 1
- 1
ge/graph/manager/model_manager/event_manager.cc View File

@@ -45,7 +45,7 @@ Status EventManager::Init(size_t event_num) {
void EventManager::Release() noexcept { void EventManager::Release() noexcept {
for (size_t i = 0; i < this->event_list_.size(); ++i) { for (size_t i = 0; i < this->event_list_.size(); ++i) {
rtError_t rt_ret = rtEventDestroy(this->event_list_[i]); rtError_t rt_ret = rtEventDestroy(this->event_list_[i]);
RETURN_IF_COND_NOT_MET(rt_ret == RT_ERROR_NONE, "Destroy event failed, idx is %zu, ret is 0x%x.", i, rt_ret);
RETURN_IF_COND_NOT_MET(rt_ret == RT_ERROR_NONE, "[Destroy][Event] failed, idx is %zu, ret is 0x%x.", i, rt_ret);
} }
this->event_list_.clear(); this->event_list_.clear();




+ 10
- 11
ge/graph/manager/rdma_pool_allocator.cc View File

@@ -82,8 +82,8 @@ Status RdmaPoolAllocator::InitMemory(size_t mem_size) {
auto device_id = GetContext().DeviceId(); auto device_id = GetContext().DeviceId();
GELOGD("Init Rdma Memory with size [%zu] for devid:[%u]", mem_size, device_id); GELOGD("Init Rdma Memory with size [%zu] for devid:[%u]", mem_size, device_id);
if (rdma_base_addr_ != nullptr) { if (rdma_base_addr_ != nullptr) {
REPORT_INNER_ERROR("E19999", "Param rdma_base_addr_ is nullptr, check invalid");
GELOGE(GE_MULTI_INIT, "Rdma pool has been malloced");
REPORT_INNER_ERROR("E19999", "Param rdma_base_addr_ is not nullptr, devid:%u, check invalid", device_id);
GELOGE(GE_MULTI_INIT, "[Check][Param] Rdma pool has been malloced, devid:%u", device_id);
return GE_MULTI_INIT; return GE_MULTI_INIT;
} }
const std::string purpose = "Memory for rdma pool."; const std::string purpose = "Memory for rdma pool.";
@@ -95,15 +95,15 @@ Status RdmaPoolAllocator::InitMemory(size_t mem_size) {


rdma_base_addr_ = memory_allocator_->MallocMemory(purpose, mem_size, device_id); rdma_base_addr_ = memory_allocator_->MallocMemory(purpose, mem_size, device_id);
if (rdma_base_addr_ == nullptr) { if (rdma_base_addr_ == nullptr) {
GELOGE(GE_GRAPH_MALLOC_FAILED, "Rdma pool memory malloc failed");
GELOGE(GE_GRAPH_MALLOC_FAILED, "[Malloc][Memory] failed, size:%zu, device_id:%u", mem_size, device_id);
return GE_GRAPH_MALLOC_FAILED; return GE_GRAPH_MALLOC_FAILED;
} }
rdma_mem_size_ = mem_size; rdma_mem_size_ = mem_size;
// Init with a base block. // Init with a base block.
auto *base_block = new (std::nothrow) Block(device_id, mem_size, rdma_base_addr_); auto *base_block = new (std::nothrow) Block(device_id, mem_size, rdma_base_addr_);
if (base_block == nullptr) { if (base_block == nullptr) {
REPORT_CALL_ERROR("E19999", "New Block failed, device_id:%u", device_id);
GELOGE(GE_GRAPH_MALLOC_FAILED, "Block malloc failed");
REPORT_CALL_ERROR("E19999", "New Block failed, size:%zu, device_id:%u", mem_size, device_id);
GELOGE(GE_GRAPH_MALLOC_FAILED, "[New][Block] failed, size:%zu, device_id:%u", mem_size, device_id);
return GE_GRAPH_MALLOC_FAILED; return GE_GRAPH_MALLOC_FAILED;
} }
block_bin_.insert(base_block); block_bin_.insert(base_block);
@@ -123,7 +123,7 @@ uint8_t *RdmaPoolAllocator::Malloc(size_t size, uint32_t device_id) {
if (block->ptr == nullptr) { if (block->ptr == nullptr) {
REPORT_INNER_ERROR("E19999", "Rdmapool memory address is nullptr, device_id:%u, check invalid", REPORT_INNER_ERROR("E19999", "Rdmapool memory address is nullptr, device_id:%u, check invalid",
device_id); device_id);
GELOGE(INTERNAL_ERROR, "Rdmapool memory address is nullptr.");
GELOGE(INTERNAL_ERROR, "[Check][Param] Rdmapool memory address is nullptr, device_id:%u", device_id);
return nullptr; return nullptr;
} }
allocated_blocks_.emplace(block->ptr, block); allocated_blocks_.emplace(block->ptr, block);
@@ -155,9 +155,8 @@ uint8_t *RdmaPoolAllocator::Malloc(size_t size, uint32_t device_id) {
Status RdmaPoolAllocator::Free(uint8_t *memory_addr, uint32_t device_id) { Status RdmaPoolAllocator::Free(uint8_t *memory_addr, uint32_t device_id) {
GELOGI("Free rdma memory, device id = %u", device_id); GELOGI("Free rdma memory, device id = %u", device_id);
if (memory_addr == nullptr) { if (memory_addr == nullptr) {
REPORT_INNER_ERROR("E19999", "Param memory_addr is nullptr, device_id:%u, check invalid",
device_id);
GELOGE(GE_GRAPH_FREE_FAILED, "Invalid memory pointer");
REPORT_INNER_ERROR("E19999", "Param memory_addr is nullptr, device_id:%u, check invalid", device_id);
GELOGE(GE_GRAPH_FREE_FAILED, "[Check][Param] Invalid memory pointer, device id:%u", device_id);
return GE_GRAPH_FREE_FAILED; return GE_GRAPH_FREE_FAILED;
} }


@@ -166,7 +165,7 @@ Status RdmaPoolAllocator::Free(uint8_t *memory_addr, uint32_t device_id) {
if (it == allocated_blocks_.end()) { if (it == allocated_blocks_.end()) {
REPORT_INNER_ERROR("E19999", "Param memory_addr is not allocated before, device_id:%u, " REPORT_INNER_ERROR("E19999", "Param memory_addr is not allocated before, device_id:%u, "
"check invalid", device_id); "check invalid", device_id);
GELOGE(PARAM_INVALID, "Invalid memory pointer");
GELOGE(PARAM_INVALID, "[Check][Param] Invalid memory pointer, device id:%u", device_id);
return PARAM_INVALID; return PARAM_INVALID;
} }


@@ -209,7 +208,7 @@ void RdmaPoolAllocator::MergeBlocks(Block *dst, Block *src) {
Status RdmaPoolAllocator::GetBaseAddr(uint64_t &base_addr, uint64_t &mem_size) { Status RdmaPoolAllocator::GetBaseAddr(uint64_t &base_addr, uint64_t &mem_size) {
if (rdma_base_addr_ == nullptr) { if (rdma_base_addr_ == nullptr) {
REPORT_INNER_ERROR("E19999", "Param rdma_base_addr_ is nullptr, check invalid"); REPORT_INNER_ERROR("E19999", "Param rdma_base_addr_ is nullptr, check invalid");
GELOGE(INTERNAL_ERROR, "Rdma base addr is nullptr.");
GELOGE(INTERNAL_ERROR, "[Check][Param] Rdma base addr is nullptr.");
return INTERNAL_ERROR; return INTERNAL_ERROR;
} }
base_addr = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(rdma_base_addr_)); base_addr = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(rdma_base_addr_));


+ 75
- 57
ge/graph/manager/trans_var_data_utils.cc View File

@@ -37,7 +37,8 @@ class RtContextSwitchGuard {
if (ret != RT_ERROR_NONE) { if (ret != RT_ERROR_NONE) {
REPORT_CALL_ERROR("E19999", "Call rtCtxGetCurrent failed, device_id:%u, ret:0x%X,", REPORT_CALL_ERROR("E19999", "Call rtCtxGetCurrent failed, device_id:%u, ret:0x%X,",
device_id, ret); device_id, ret);
GELOGE(RT_FAILED, "Failed to get current context from rt, error-code %d", ret);
GELOGE(RT_FAILED, "[Call][RtCtxGetCurrent] Failed to get current context, device_id:%u, ret:0x%X",
device_id, ret);
return; return;
} }


@@ -45,15 +46,14 @@ class RtContextSwitchGuard {
if (ret != RT_ERROR_NONE) { if (ret != RT_ERROR_NONE) {
REPORT_CALL_ERROR("E19999", "Call rtCtxCreate failed, device_id:%u, ret:0x%X,", REPORT_CALL_ERROR("E19999", "Call rtCtxCreate failed, device_id:%u, ret:0x%X,",
device_id, ret); device_id, ret);
GELOGE(RT_FAILED, "Failed to create new context for device %u, error-code %d", device_id, ret);
GELOGE(RT_FAILED, "[Call][RtCtxCreate] Failed to create new context for device:%u, ret:%d", device_id, ret);
return; return;
} }


ret = rtCtxSetCurrent(current_); ret = rtCtxSetCurrent(current_);
if (ret != RT_ERROR_NONE) { if (ret != RT_ERROR_NONE) {
REPORT_CALL_ERROR("E19999", "Call rtCtxSetCurrent failed, device_id:%u, ret:0x%X,",
device_id, ret);
GELOGE(RT_FAILED, "Failed to switch context to normal, context %p, device %u", current_, device_id);
REPORT_CALL_ERROR("E19999", "Call rtCtxSetCurrent failed, device_id:%u, ret:0x%X", device_id, ret);
GELOGE(RT_FAILED, "[Call][RtCtxSetCurrent] failed, device_id:%u, ret:0x%X", device_id, ret);
return; return;
} }
GELOGD("Create and switch rt context %p type %d for device %u, backup last %p.", current_, mode, device_id, last_); GELOGD("Create and switch rt context %p type %d for device %u, backup last %p.", current_, mode, device_id, last_);
@@ -80,7 +80,7 @@ int64_t CalcVarSizeInBytes(const GeTensorDesc &desc) {
if (var_size <= 0) { if (var_size <= 0) {
REPORT_INNER_ERROR("E19999", "Data type:%s in desc, it's size:%ld < 0, check invalid", REPORT_INNER_ERROR("E19999", "Data type:%s in desc, it's size:%ld < 0, check invalid",
TypeUtils::DataTypeToSerialString(desc.GetDataType()).c_str(), var_size); TypeUtils::DataTypeToSerialString(desc.GetDataType()).c_str(), var_size);
GELOGE(PARAM_INVALID, "Failed to calc var data size from data type %s",
GELOGE(PARAM_INVALID, "[Calc][VarDataSize] by data type %s failed.",
TypeUtils::DataTypeToSerialString(desc.GetDataType()).c_str()); TypeUtils::DataTypeToSerialString(desc.GetDataType()).c_str());
return -1; return -1;
} }
@@ -99,7 +99,8 @@ Status CopyVarToDevice(const NodePtr &var, const formats::TransResult &trans_res
if (ret != RT_ERROR_NONE) { if (ret != RT_ERROR_NONE) {
REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, op:%s(%s), size:%lu, ret:0x%X,", var->GetName().c_str(), REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, op:%s(%s), size:%lu, ret:0x%X,", var->GetName().c_str(),
var->GetType().c_str(), trans_result.length, ret); var->GetType().c_str(), trans_result.length, ret);
GELOGE(RT_FAILED, "Failed to copy memory to device, size %zu", trans_result.length);
GELOGE(RT_FAILED, "[Call][RtMemcpy] failed, op:%s(%s), size:%lu, ret:0x%X,", var->GetName().c_str(),
var->GetType().c_str(), trans_result.length, ret);
return RT_FAILED; return RT_FAILED;
} }
return SUCCESS; return SUCCESS;
@@ -111,21 +112,17 @@ Status CopyVarFromDevice(uint64_t session_id, const NodePtr &var, std::unique_pt
GE_CHECK_NOTNULL(var); GE_CHECK_NOTNULL(var);
auto ret = VarManager::Instance(session_id)->GetVarAddr(var->GetName(), input_desc, &var_logic); auto ret = VarManager::Instance(session_id)->GetVarAddr(var->GetName(), input_desc, &var_logic);
if (ret != SUCCESS) { if (ret != SUCCESS) {
GELOGE(INTERNAL_ERROR,
"Failed to copy var %s from device, can not find it"
" from var manager %u",
var->GetName().c_str(), ret);
GELOGE(INTERNAL_ERROR, "[Get][VarAddr] failed, node:%s, session_id:%lu, ret:%d",
var->GetName().c_str(), session_id, ret);
return INTERNAL_ERROR; return INTERNAL_ERROR;
} }


uint8_t *var_addr = VarManager::Instance(session_id)->GetVarMemoryAddr(var_logic, RT_MEMORY_HBM); uint8_t *var_addr = VarManager::Instance(session_id)->GetVarMemoryAddr(var_logic, RT_MEMORY_HBM);
if (var_addr == nullptr) { if (var_addr == nullptr) {
REPORT_CALL_ERROR("E19999", "Get variable memory addr failed, mem_type:%d, op:%s(%s), session_id:%lu,",
REPORT_CALL_ERROR("E19999", "Get variable memory addr failed, mem_type:%d, op:%s(%s), session_id:%lu",
RT_MEMORY_HBM, var->GetName().c_str(), var->GetType().c_str(), session_id); RT_MEMORY_HBM, var->GetName().c_str(), var->GetType().c_str(), session_id);
GELOGE(INTERNAL_ERROR,
"Failed to copy var %s from device, cant not get "
"var addr from logic addr %p",
var->GetName().c_str(), var_logic);
GELOGE(INTERNAL_ERROR, "[Get][VarMemoryAddr] failed, mem_type:%d, op:%s(%s), session_id:%lu",
RT_MEMORY_HBM, var->GetName().c_str(), var->GetType().c_str(), session_id);
return INTERNAL_ERROR; return INTERNAL_ERROR;
} }


@@ -136,9 +133,10 @@ Status CopyVarFromDevice(uint64_t session_id, const NodePtr &var, std::unique_pt


std::unique_ptr<uint8_t[]> var_host(new(std::nothrow) uint8_t[var_size_bytes]); std::unique_ptr<uint8_t[]> var_host(new(std::nothrow) uint8_t[var_size_bytes]);
if (var_host == nullptr) { if (var_host == nullptr) {
REPORT_CALL_ERROR("E19999", "New host memory failed, size:%ld, op:%s(%s), session_id:%lu,",
REPORT_CALL_ERROR("E19999", "New host memory failed, size:%ld, op:%s(%s), session_id:%lu",
var_size_bytes, var->GetName().c_str(), var->GetType().c_str(), session_id); var_size_bytes, var->GetName().c_str(), var->GetType().c_str(), session_id);
GELOGE(OUT_OF_MEMORY, "Failed to malloc rt-host memory, size %ld", var_size_bytes);
GELOGE(OUT_OF_MEMORY, "[New][Memory] for rt-host failed, size:%ld, op:%s(%s), session_id:%lu",
var_size_bytes, var->GetName().c_str(), var->GetType().c_str(), session_id);
return OUT_OF_MEMORY; return OUT_OF_MEMORY;
} }


@@ -147,10 +145,8 @@ Status CopyVarFromDevice(uint64_t session_id, const NodePtr &var, std::unique_pt
if (ret != RT_ERROR_NONE) { if (ret != RT_ERROR_NONE) {
REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%ld, op:%s(%s), session_id:%lu, ret:0x%X", REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%ld, op:%s(%s), session_id:%lu, ret:0x%X",
var_size_bytes, var->GetName().c_str(), var->GetType().c_str(), session_id, ret); var_size_bytes, var->GetName().c_str(), var->GetType().c_str(), session_id, ret);
GELOGE(RT_FAILED,
"Failed to copy var memory from device, var %s, size %ld,"
" rt-error-code %u",
var->GetName().c_str(), var_size_bytes, ret);
GELOGE(RT_FAILED, "[Call][RtMemcpy] failed, size:%ld, op:%s(%s), session_id:%lu, ret:0x%X",
var_size_bytes, var->GetName().c_str(), var->GetType().c_str(), session_id, ret);
return RT_FAILED; return RT_FAILED;
} }


@@ -197,9 +193,7 @@ Status TransVarOnHost(uint8_t *var_data, const VarTransRoad &trans_road, formats
formats::ShapeToString(src_shape).c_str(), formats::ShapeToString(src_shape).c_str(),
formats::ShapeToString(dst_shape).c_str(), formats::ShapeToString(dst_shape).c_str(),
TypeUtils::DataTypeToSerialString(data_type).c_str(), ret); TypeUtils::DataTypeToSerialString(data_type).c_str(), ret);
GELOGE(INTERNAL_ERROR,
"Failed to trans format from %s to %s, shape %s to %s, "
"data type %s error code %u",
GELOGE(INTERNAL_ERROR, "[Trans][Format] from %s to %s, shape %s to %s failed, data type %s error code %u",
TypeUtils::FormatToSerialString(src_format).c_str(), TypeUtils::FormatToSerialString(dst_format).c_str(), TypeUtils::FormatToSerialString(src_format).c_str(), TypeUtils::FormatToSerialString(dst_format).c_str(),
formats::ShapeToString(src_shape).c_str(), formats::ShapeToString(dst_shape).c_str(), formats::ShapeToString(src_shape).c_str(), formats::ShapeToString(dst_shape).c_str(),
TypeUtils::DataTypeToSerialString(data_type).c_str(), ret); TypeUtils::DataTypeToSerialString(data_type).c_str(), ret);
@@ -221,7 +215,7 @@ Status TransVarOnHost(uint8_t *var_data, const VarTransRoad &trans_road, formats
TypeUtils::DataTypeToSerialString(src_data_type).c_str(), TypeUtils::DataTypeToSerialString(src_data_type).c_str(),
TypeUtils::DataTypeToSerialString(dst_data_type).c_str(), TypeUtils::DataTypeToSerialString(dst_data_type).c_str(),
formats::ShapeToString(input_shape).c_str(), src_data_size, ret); formats::ShapeToString(input_shape).c_str(), src_data_size, ret);
GELOGE(INTERNAL_ERROR, "Failed to trans data type from %s to %s, input shape %s, data size %ld, error code %u",
GELOGE(INTERNAL_ERROR, "[Trans][DataType] from %s to %s failed, input shape %s, data size %ld, error code %u",
TypeUtils::DataTypeToSerialString(src_data_type).c_str(), TypeUtils::DataTypeToSerialString(src_data_type).c_str(),
TypeUtils::DataTypeToSerialString(dst_data_type).c_str(), formats::ShapeToString(input_shape).c_str(), TypeUtils::DataTypeToSerialString(dst_data_type).c_str(), formats::ShapeToString(input_shape).c_str(),
src_data_size, ret); src_data_size, ret);
@@ -230,7 +224,7 @@ Status TransVarOnHost(uint8_t *var_data, const VarTransRoad &trans_road, formats
} else { } else {
REPORT_INNER_ERROR("E19999", "Trans var data failed, the trans type %s does not supported, check invalid", REPORT_INNER_ERROR("E19999", "Trans var data failed, the trans type %s does not supported, check invalid",
trans_info.node_type.c_str()); trans_info.node_type.c_str());
GELOGE(UNSUPPORTED, "Failed to trans var data, the trans type %s does not supported",
GELOGE(UNSUPPORTED, "[Trans][VarData] failed, the trans type %s does not supported",
trans_info.node_type.c_str()); trans_info.node_type.c_str());
return UNSUPPORTED; return UNSUPPORTED;
} }
@@ -255,10 +249,8 @@ Status ReAssignVarAddr(uint64_t session_id,
uint8_t *var_logic = nullptr; uint8_t *var_logic = nullptr;
Status ret = VarManager::Instance(session_id)->GetVarAddr(var_name, tensor_desc, &var_logic); Status ret = VarManager::Instance(session_id)->GetVarAddr(var_name, tensor_desc, &var_logic);
if (ret != SUCCESS) { if (ret != SUCCESS) {
GELOGE(INTERNAL_ERROR,
"Failed to get var %s device addr, can not find it"
" from var manager %u",
var_name.c_str(), ret);
GELOGE(INTERNAL_ERROR, "[Get][VarAddr] failed, var name:%s, session_id:%lu, ret:%u",
var_name.c_str(), session_id, ret);
return INTERNAL_ERROR; return INTERNAL_ERROR;
} }


@@ -266,7 +258,8 @@ Status ReAssignVarAddr(uint64_t session_id,
if (var_addr == nullptr) { if (var_addr == nullptr) {
REPORT_CALL_ERROR("E19999", "Get variable memory addr failed, mem_type:%d, var_name:%s, session_id:%lu,", REPORT_CALL_ERROR("E19999", "Get variable memory addr failed, mem_type:%d, var_name:%s, session_id:%lu,",
RT_MEMORY_HBM, var_name.c_str(), session_id); RT_MEMORY_HBM, var_name.c_str(), session_id);
GELOGE(INTERNAL_ERROR, "Failed to convert var %s logic addr to real addr", var_name.c_str());
GELOGE(INTERNAL_ERROR, "[Get][VarMemoryAddr] failed, mem_type:%d, var_name:%s, session_id:%lu",
RT_MEMORY_HBM, var_name.c_str(), session_id);
return INTERNAL_ERROR; return INTERNAL_ERROR;
} }
*var_device = var_addr; *var_device = var_addr;
@@ -293,9 +286,8 @@ Status TransVarData(const NodePtr &var, const VarTransRoad &trans_road, uint64_t
// Sync var data from device // Sync var data from device
std::unique_ptr<uint8_t[]> var_data; std::unique_ptr<uint8_t[]> var_data;
if (trans_road.empty()) { if (trans_road.empty()) {
REPORT_INNER_ERROR("E19999", "Param trans_road is empty, session_id:%lu, check invalid",
session_id);
GELOGE(INTERNAL_ERROR, "Failed to get trans_road, trans_road is empty.");
REPORT_INNER_ERROR("E19999", "Param trans_road is empty, session_id:%lu, check invalid", session_id);
GELOGE(INTERNAL_ERROR, "[Check][Param] trans_road is empty, session_id:%lu", session_id);
return INTERNAL_ERROR; return INTERNAL_ERROR;
} }
const GeTensorDesc &input_desc = trans_road.begin()->input; const GeTensorDesc &input_desc = trans_road.begin()->input;
@@ -307,7 +299,7 @@ Status TransVarData(const NodePtr &var, const VarTransRoad &trans_road, uint64_t
formats::TransResult trans_result{}; formats::TransResult trans_result{};
ret = TransVarOnHost(var_data.get(), trans_road, trans_result); ret = TransVarOnHost(var_data.get(), trans_road, trans_result);
if (ret != SUCCESS) { if (ret != SUCCESS) {
GELOGE(ret, "Failed to trans var data on host, error code %u", ret);
GELOGE(ret, "[Call][TransVarOnHost] failed, session_id:%lu, ret:%u", session_id, ret);
return ret; return ret;
} }


@@ -319,14 +311,15 @@ Status TransVarData(const NodePtr &var, const VarTransRoad &trans_road, uint64_t
/// TensorDesc needs to be removed. This change is large and needs to be performed step by step. /// TensorDesc needs to be removed. This change is large and needs to be performed step by step.
ret = ReAssignVarAddr(session_id, var->GetName(), trans_road.rbegin()->output, &var_device); ret = ReAssignVarAddr(session_id, var->GetName(), trans_road.rbegin()->output, &var_device);
if (ret != SUCCESS) { if (ret != SUCCESS) {
GELOGE(ret, "Failed to re-assign memory on device, size %zu", trans_result.length);
GELOGE(ret, "[Call][ReAssignVarAddr] failed, session id:%lu, op:%s, ret:%u",
session_id, var->GetName().c_str(), ret);
return ret; return ret;
} }


// sync new data to device // sync new data to device
ret = CopyVarToDevice(var, trans_result, var_device); ret = CopyVarToDevice(var, trans_result, var_device);
if (ret != SUCCESS) { if (ret != SUCCESS) {
GELOGE(ret, "Failed to send var data to device");
GELOGE(ret, "[Call][CopyVarToDevice] failed, var:%s, ret:%u", var->GetName().c_str(), ret);
return ret; return ret;
} }


@@ -350,7 +343,10 @@ Status TransTensor(uint8_t *var_data, const NodePtr &var_src, const NodePtr &var
TypeUtils::DataTypeToSerialString(src_data_datatype).c_str(), TypeUtils::DataTypeToSerialString(src_data_datatype).c_str(),
TypeUtils::DataTypeToSerialString(dst_data_datatype).c_str(), TypeUtils::DataTypeToSerialString(dst_data_datatype).c_str(),
src_data_shape_size, ret); src_data_shape_size, ret);
GELOGE(INTERNAL_ERROR, "trans var data on host failed");
GELOGE(INTERNAL_ERROR, "[Trans][DataType] from %s to %s failed, data size %ld, ret:%u",
TypeUtils::DataTypeToSerialString(src_data_datatype).c_str(),
TypeUtils::DataTypeToSerialString(dst_data_datatype).c_str(),
src_data_shape_size, ret);
return ret; return ret;
}); });
return SUCCESS; return SUCCESS;
@@ -366,9 +362,11 @@ Status CopyTensorFromSrcVarNode(const NodePtr &var_src,
/// need copy value from var_fp32 to var_fp16. /// need copy value from var_fp32 to var_fp16.
/// [opdesc of var_src and var_dst are checked before passed in, no need to check if they are nullptr] /// [opdesc of var_src and var_dst are checked before passed in, no need to check if they are nullptr]
GE_IF_BOOL_EXEC(var_src == nullptr || var_dst == nullptr, GE_IF_BOOL_EXEC(var_src == nullptr || var_dst == nullptr,
REPORT_INNER_ERROR("E19999", "Param var_src or var_dst is empty, session_id:%lu, device_id:%u, "
REPORT_INNER_ERROR("E19999", "Param var_src or var_dst is nullptr, session_id:%lu, device_id:%u, "
"check invalid", session_id, device_id); "check invalid", session_id, device_id);
GELOGE(FAILED, "node var is nullptr"); return FAILED);
GELOGE(FAILED, "[Check][Param] Param var_src or var_dst is nullptr, session_id:%lu, device_id:%u",
session_id, device_id);
return FAILED);
// src_node output_desc (fp32) // src_node output_desc (fp32)
GeTensorDesc output_desc = var_src->GetOpDesc()->GetOutputDesc(0); GeTensorDesc output_desc = var_src->GetOpDesc()->GetOutputDesc(0);
auto src_data_type = output_desc.GetDataType(); auto src_data_type = output_desc.GetDataType();
@@ -390,31 +388,45 @@ Status CopyTensorFromSrcVarNode(const NodePtr &var_src,
RtContextSwitchGuard switch_context(RT_CTX_NORMAL_MODE, device_id); RtContextSwitchGuard switch_context(RT_CTX_NORMAL_MODE, device_id);
// copy from src_node // copy from src_node
auto ret = CopyVarFromDevice(session_id, var_src, var_src_data, output_desc); auto ret = CopyVarFromDevice(session_id, var_src, var_src_data, output_desc);
GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(FAILED, "Copy Var From Device failed"); return ret);
GE_IF_BOOL_EXEC(ret != SUCCESS,
GELOGE(FAILED, "[Call][CopyVarFromDevice] failed, session id:%lu, var_src:%s",
session_id, var_src->GetName().c_str());
return ret);
// trans dtype // trans dtype
formats::TransResult trans_result{}; formats::TransResult trans_result{};
ret = TransTensor(var_src_data.get(), var_src, var_dst, trans_result); ret = TransTensor(var_src_data.get(), var_src, var_dst, trans_result);
GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(INTERNAL_ERROR, "trans var data on host failed"); return ret);
GE_IF_BOOL_EXEC(ret != SUCCESS,
GELOGE(INTERNAL_ERROR, "[Trans][Tensor] failed, var_src:%s, var_dst:%s",
var_src->GetName().c_str(), var_dst->GetName().c_str());
return ret);
// reset src value. // reset src value.
void *var_device = nullptr; void *var_device = nullptr;
ret = ReAssignVarAddr(session_id, var_dst->GetName(), dst_tensor_desc, &var_device); ret = ReAssignVarAddr(session_id, var_dst->GetName(), dst_tensor_desc, &var_device);
GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(INTERNAL_ERROR, "assign mem failed"); return ret);
GE_IF_BOOL_EXEC(ret != SUCCESS,
GELOGE(INTERNAL_ERROR, "[Call][ReAssignVarAddr] failed, session id:%lu, var_dst:%s",
session_id, var_dst->GetName().c_str());
return ret);
// copy to device // copy to device
ret = CopyVarToDevice(var_dst, trans_result, var_device); ret = CopyVarToDevice(var_dst, trans_result, var_device);
GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(ret, "Failed to send var data to device"); return ret);
GE_IF_BOOL_EXEC(ret != SUCCESS,
GELOGE(ret, "[Call][CopyVarToDevice] failed, var_dst:%s, ret:%u",
var_dst->GetName().c_str(), ret);
return ret);
return SUCCESS; return SUCCESS;
} }
} // namespace } // namespace
Status TransVarDataUtils::SyncVarData2BroadCast(const string &var_name, const ge::GeTensorDesc &src_tensor_desc, Status TransVarDataUtils::SyncVarData2BroadCast(const string &var_name, const ge::GeTensorDesc &src_tensor_desc,
uint8_t *dst_addr, int64_t dst_addr_size, uint64_t session_id) { uint8_t *dst_addr, int64_t dst_addr_size, uint64_t session_id) {
GE_CHK_BOOL_RET_STATUS(dst_addr != nullptr, FAILED, "dst addr is null. ");
GE_CHK_BOOL_RET_STATUS(dst_addr != nullptr, FAILED, "[Check][Param] dst addr is nullptr.");
uint8_t *src_host_addr = nullptr; uint8_t *src_host_addr = nullptr;
int64_t src_addr_size = 0; int64_t src_addr_size = 0;
GE_MAKE_GUARD_RTMEM(src_host_addr); GE_MAKE_GUARD_RTMEM(src_host_addr);
GE_CHK_STATUS_RET(SyncTensorToHost(var_name, src_tensor_desc, &src_host_addr, src_addr_size, session_id)); GE_CHK_STATUS_RET(SyncTensorToHost(var_name, src_tensor_desc, &src_host_addr, src_addr_size, session_id));


GELOGI("src_addr_size: %ld, dst_addr_size: %ld", src_addr_size, dst_addr_size); GELOGI("src_addr_size: %ld, dst_addr_size: %ld", src_addr_size, dst_addr_size);
GE_CHK_BOOL_RET_STATUS(src_addr_size == dst_addr_size, FAILED, "var data size is not equal broadcast ");
GE_CHK_BOOL_RET_STATUS(src_addr_size == dst_addr_size, FAILED,
"[Check][Param] src_addr_size:%ld not equal to dst_addr_size:%ld",
src_addr_size, dst_addr_size);


GE_CHK_RT_RET(rtMemcpy(dst_addr, dst_addr_size, src_host_addr, src_addr_size, RT_MEMCPY_HOST_TO_DEVICE)); GE_CHK_RT_RET(rtMemcpy(dst_addr, dst_addr_size, src_host_addr, src_addr_size, RT_MEMCPY_HOST_TO_DEVICE));
return SUCCESS; return SUCCESS;
@@ -422,7 +434,7 @@ Status TransVarDataUtils::SyncVarData2BroadCast(const string &var_name, const ge


Status TransVarDataUtils::SyncBroadCastData2Var(uint8_t *src_addr, int64_t src_addr_size, const string &var_name, Status TransVarDataUtils::SyncBroadCastData2Var(uint8_t *src_addr, int64_t src_addr_size, const string &var_name,
const ge::GeTensorDesc &dst_tensor_desc, uint64_t session_id) { const ge::GeTensorDesc &dst_tensor_desc, uint64_t session_id) {
GE_CHK_BOOL_RET_STATUS(src_addr != nullptr, FAILED, "src addr is null. ");
GE_CHK_BOOL_RET_STATUS(src_addr != nullptr, FAILED, "[Check][Param] src addr is nullptr. ");
uint8_t *host_addr = nullptr; uint8_t *host_addr = nullptr;
GE_MAKE_GUARD_RTMEM(host_addr); GE_MAKE_GUARD_RTMEM(host_addr);
GE_CHK_RT_RET(rtMallocHost(reinterpret_cast<void **>(&host_addr), src_addr_size)); GE_CHK_RT_RET(rtMallocHost(reinterpret_cast<void **>(&host_addr), src_addr_size));
@@ -436,7 +448,7 @@ Status TransVarDataUtils::SyncBroadCastData2Var(uint8_t *src_addr, int64_t src_a


Status TransVarDataUtils::SyncTensorToHost(const string &var_name, const ge::GeTensorDesc &src_tensor_desc, Status TransVarDataUtils::SyncTensorToHost(const string &var_name, const ge::GeTensorDesc &src_tensor_desc,
uint8_t **host_addr, int64_t &src_tensor_size, uint64_t session_id) { uint8_t **host_addr, int64_t &src_tensor_size, uint64_t session_id) {
GE_CHK_STATUS_RET(ge::TensorUtils::GetSize(src_tensor_desc, src_tensor_size), "get size from TensorDesc failed");
GE_CHK_STATUS_RET(ge::TensorUtils::GetSize(src_tensor_desc, src_tensor_size), "[Get][Size] from TensorDesc failed");


uint8_t *src_addr = nullptr; uint8_t *src_addr = nullptr;
GE_CHK_STATUS_RET(VarManager::Instance(session_id)->GetVarAddr(var_name, src_tensor_desc, &src_addr)); GE_CHK_STATUS_RET(VarManager::Instance(session_id)->GetVarAddr(var_name, src_tensor_desc, &src_addr));
@@ -493,7 +505,8 @@ Status TransVarDataUtils::TransAllVarData(const vector<NodePtr> &variable_nodes,
if (rt_ret != RT_ERROR_NONE) { if (rt_ret != RT_ERROR_NONE) {
REPORT_CALL_ERROR("E19999", "Call rtCtxSetCurrent failed, session_id:%lu, graph_id:%u, ret:0x%X,", REPORT_CALL_ERROR("E19999", "Call rtCtxSetCurrent failed, session_id:%lu, graph_id:%u, ret:0x%X,",
session_id, graph_id, rt_ret); session_id, graph_id, rt_ret);
GELOGE(RT_FAILED, "Failed to set context, error_code is: 0x%X.", rt_ret);
GELOGE(RT_FAILED, "[Call][RtCtxSetCurrent] failed, session_id:%lu, graph_id:%u, ret:0x%X,",
session_id, graph_id, rt_ret);
return RT_ERROR_TO_GE_STATUS(rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret);
} }
uint32_t allocated_graph_id = 0; uint32_t allocated_graph_id = 0;
@@ -501,8 +514,8 @@ Status TransVarDataUtils::TransAllVarData(const vector<NodePtr> &variable_nodes,
if (ret != SUCCESS) { if (ret != SUCCESS) {
REPORT_CALL_ERROR("E19999", "Get allocated GraphId failed, session_id:%lu, graph_id:%u, ret:0x%X,", REPORT_CALL_ERROR("E19999", "Get allocated GraphId failed, session_id:%lu, graph_id:%u, ret:0x%X,",
session_id, graph_id, ret); session_id, graph_id, ret);
GELOGE(INTERNAL_ERROR, "var has not been allocated, node:%s, graph_id:%u.", node->GetName().c_str(),
graph_id);
GELOGE(INTERNAL_ERROR, "[Get][AllocatedGraphId] failed, node:%s, graph_id:%u.",
node->GetName().c_str(), graph_id);
return INTERNAL_ERROR; return INTERNAL_ERROR;
} }
uint32_t changed_graph_id = 0; uint32_t changed_graph_id = 0;
@@ -518,7 +531,8 @@ Status TransVarDataUtils::TransAllVarData(const vector<NodePtr> &variable_nodes,
} }
ret = TransVarData(node, *trans_road, session_id); ret = TransVarData(node, *trans_road, session_id);
if (ret != SUCCESS) { if (ret != SUCCESS) {
GELOGE(INTERNAL_ERROR, "TransVarData failed, node:%s, graph_id:%u.", node->GetName().c_str(), graph_id);
GELOGE(INTERNAL_ERROR, "[Trans][VarData] failed, node:%s, graph_id:%u, session_id:%lu.",
node->GetName().c_str(), graph_id, session_id);
return INTERNAL_ERROR; return INTERNAL_ERROR;
} }
VarManager::Instance(session_id)->RemoveChangedGraphId(node->GetName()); VarManager::Instance(session_id)->RemoveChangedGraphId(node->GetName());
@@ -527,7 +541,7 @@ Status TransVarDataUtils::TransAllVarData(const vector<NodePtr> &variable_nodes,
}, },
node, session_id, context, graph_id, ErrorManager::GetInstance().GetErrorManagerContext()); node, session_id, context, graph_id, ErrorManager::GetInstance().GetErrorManagerContext());
if (!f.valid()) { if (!f.valid()) {
GELOGE(FAILED, "Future is invalid");
GELOGE(FAILED, "[Check][Param] Future is invalid, session id:%lu, graph id:%u", session_id, graph_id);
return FAILED; return FAILED;
} }
vector_future.push_back(std::move(f)); vector_future.push_back(std::move(f));
@@ -537,7 +551,7 @@ Status TransVarDataUtils::TransAllVarData(const vector<NodePtr> &variable_nodes,
for (size_t i = 0; i < vector_future.size(); ++i) { for (size_t i = 0; i < vector_future.size(); ++i) {
ret_status = vector_future[i].get(); ret_status = vector_future[i].get();
if (ret_status != SUCCESS) { if (ret_status != SUCCESS) {
GELOGE(ret_status, "TransAllVarData:: trans %zu vardata failed", i);
GELOGE(ret_status, "[Check][Param] trans %zu vardata failed", i);
return ret_status; return ret_status;
} }
} }
@@ -550,7 +564,8 @@ Status TransVarDataUtils::CopyVarData(const ComputeGraphPtr &compute_graph, uint
if (compute_graph == nullptr) { if (compute_graph == nullptr) {
REPORT_INNER_ERROR("E19999", "Param compute_graph is nullptr, session_id:%lu, device_id:%u, check invalid", REPORT_INNER_ERROR("E19999", "Param compute_graph is nullptr, session_id:%lu, device_id:%u, check invalid",
session_id, device_id); session_id, device_id);
GELOGE(FAILED, "compute_graph is nullptr");
GELOGE(FAILED, "[Check][Param] compute_graph is nullptr, session_id:%lu, device_id:%u",
session_id, device_id);
return FAILED; return FAILED;
} }


@@ -568,7 +583,10 @@ Status TransVarDataUtils::CopyVarData(const ComputeGraphPtr &compute_graph, uint
GELOGI("current_var_node__: [%s] copy_from_var_node__: [%s].", node->GetName().c_str(), GELOGI("current_var_node__: [%s] copy_from_var_node__: [%s].", node->GetName().c_str(),
src_node->GetName().c_str()); src_node->GetName().c_str());
auto ret = CopyTensorFromSrcVarNode(src_node, node, session_id, device_id); auto ret = CopyTensorFromSrcVarNode(src_node, node, session_id, device_id);
GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(FAILED, "copy tensor failed!"); return FAILED);
GE_IF_BOOL_EXEC(ret != SUCCESS,
GELOGE(FAILED, "[Copy][Tensor] failed, src_node:%s, node:%s, session_id:%lu, device_id:%u",
src_node->GetName().c_str(), node->GetName().c_str(), session_id, device_id);
return FAILED);
// only copy once // only copy once
(void) ge::AttrUtils::SetBool(node->GetOpDesc(), "_copy_value", true); // no need to check value (void) ge::AttrUtils::SetBool(node->GetOpDesc(), "_copy_value", true); // no need to check value
} }


+ 4
- 6
ge/graph/manager/util/debug.cc View File

@@ -63,17 +63,15 @@ Status Debug::DumpDevMem(const char *file, const void *addr, int64_t size) {
uint8_t *host_addr = nullptr; uint8_t *host_addr = nullptr;
rtError_t ret = rtMallocHost(reinterpret_cast<void **>(&host_addr), size); rtError_t ret = rtMallocHost(reinterpret_cast<void **>(&host_addr), size);
if (ret != RT_ERROR_NONE) { if (ret != RT_ERROR_NONE) {
REPORT_CALL_ERROR("E19999", "Call rtMallocHost failed, size:%zu, ret: 0x%X",
size, ret);
GELOGE(FAILED, "Call rt api rtMallocHost failed, ret: 0x%X", ret);
REPORT_CALL_ERROR("E19999", "Call rtMallocHost failed, size:%zu, ret:0x%X", size, ret);
GELOGE(FAILED, "[Call][RtMallocHost] failed, size:%zu, ret:0x%X", size, ret);
return FAILED; return FAILED;
} }
GE_MAKE_GUARD_RTMEM(host_addr); GE_MAKE_GUARD_RTMEM(host_addr);
ret = rtMemcpy(host_addr, size, addr, size, RT_MEMCPY_DEVICE_TO_HOST); ret = rtMemcpy(host_addr, size, addr, size, RT_MEMCPY_DEVICE_TO_HOST);
if (ret != RT_ERROR_NONE) { if (ret != RT_ERROR_NONE) {
REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%zu, ret: 0x%X",
size, ret);
GELOGE(FAILED, "Call rt api rtMemcpy failed, ret: 0x%X", ret);
REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%zu, ret:0x%X", size, ret);
GELOGE(FAILED, "[Call][RtMemcpy] failed, size:%zu, ret:0x%X", size, ret);
return FAILED; return FAILED;
} }




+ 64
- 51
ge/graph/manager/util/hcom_util.cc View File

@@ -28,7 +28,8 @@ Status HcomOmeUtil::GetHcclDataType(const ge::ConstOpDescPtr &op_desc,
std::vector<GETaskKernelHcclInfo> &kernel_hccl_infos) { std::vector<GETaskKernelHcclInfo> &kernel_hccl_infos) {
GE_CHECK_NOTNULL(op_desc); GE_CHECK_NOTNULL(op_desc);
if (CheckKernelHcclInfo(op_desc, kernel_hccl_infos) != SUCCESS) { if (CheckKernelHcclInfo(op_desc, kernel_hccl_infos) != SUCCESS) {
GELOGE(PARAM_INVALID, "HcomOmeUtil:: the number of GETaskKernelHcclInfo is invalid.");
GELOGE(PARAM_INVALID, "[Check][KernelHcclInfo] failed, op:%s(%s).",
op_desc->GetName().c_str(), op_desc->GetType().c_str());
return PARAM_INVALID; return PARAM_INVALID;
} }
GELOGI("GetHcclDataType start, node[%s], opType[%s].", op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGI("GetHcclDataType start, node[%s], opType[%s].", op_desc->GetName().c_str(), op_desc->GetType().c_str());
@@ -40,10 +41,10 @@ Status HcomOmeUtil::GetHcclDataType(const ge::ConstOpDescPtr &op_desc,
if (op_desc->GetType() == HCOMRECEIVE) { if (op_desc->GetType() == HCOMRECEIVE) {
bool ret = ge::AttrUtils::GetDataType(op_desc, HCOM_ATTR_DATA_TYPE, src_data_type); bool ret = ge::AttrUtils::GetDataType(op_desc, HCOM_ATTR_DATA_TYPE, src_data_type);
if (ret == false) { if (ret == false) {
REPORT_INNER_ERROR("E19999", "Get Attr:%s in op:%s(%s) fail",
HCOM_ATTR_DATA_TYPE.c_str(),
REPORT_INNER_ERROR("E19999", "Get Attr:%s in op:%s(%s) fail", HCOM_ATTR_DATA_TYPE.c_str(),
op_desc->GetName().c_str(), op_desc->GetType().c_str()); op_desc->GetName().c_str(), op_desc->GetType().c_str());
GELOGE(PARAM_INVALID, "op:HcomReceive, op desc no attr: dtype.");
GELOGE(PARAM_INVALID, "[Get][Attr] %s in op:%s(%s) fail", HCOM_ATTR_DATA_TYPE.c_str(),
op_desc->GetName().c_str(), op_desc->GetType().c_str());
return PARAM_INVALID; return PARAM_INVALID;
} }
} else { } else {
@@ -55,13 +56,11 @@ Status HcomOmeUtil::GetHcclDataType(const ge::ConstOpDescPtr &op_desc,
auto iter = kConstOpHcclDataType.find(static_cast<int64_t>(src_data_type)); auto iter = kConstOpHcclDataType.find(static_cast<int64_t>(src_data_type));
if (iter == kConstOpHcclDataType.end()) { if (iter == kConstOpHcclDataType.end()) {
REPORT_INNER_ERROR("E19999", "Attr:%s in op:%s(%s), value data_type:%s, not support in kConstOpHcclDataType now, " REPORT_INNER_ERROR("E19999", "Attr:%s in op:%s(%s), value data_type:%s, not support in kConstOpHcclDataType now, "
"check invalid", HCOM_ATTR_DATA_TYPE.c_str(),
op_desc->GetName().c_str(), op_desc->GetType().c_str(),
ge::TypeUtils::DataTypeToSerialString(src_data_type).c_str());
GELOGE(PARAM_INVALID,
"HcomOmeUtil:: Node: %s Optype: %s HcomDataType cann't support! Current Davinci Data Type : %s",
op_desc->GetName().c_str(), op_desc->GetType().c_str(),
ge::TypeUtils::DataTypeToSerialString(src_data_type).c_str());
"check invalid", HCOM_ATTR_DATA_TYPE.c_str(), op_desc->GetName().c_str(),
op_desc->GetType().c_str(), ge::TypeUtils::DataTypeToSerialString(src_data_type).c_str());
GELOGE(PARAM_INVALID, "[Check][Param] Attr:%s in op:%s(%s), value data_type:%s, "
"not support in kConstOpHcclDataType now", HCOM_ATTR_DATA_TYPE.c_str(), op_desc->GetName().c_str(),
op_desc->GetType().c_str(), ge::TypeUtils::DataTypeToSerialString(src_data_type).c_str());
return PARAM_INVALID; return PARAM_INVALID;
} }


@@ -73,7 +72,7 @@ Status HcomOmeUtil::GetHcclDataType(const ge::ConstOpDescPtr &op_desc,
Status HcomOmeUtil::GetHcclTypeSize(HcclDataType data_type, int32_t &size) { Status HcomOmeUtil::GetHcclTypeSize(HcclDataType data_type, int32_t &size) {
auto iter = kConstOpHcclDataTypeSize.find(data_type); auto iter = kConstOpHcclDataTypeSize.find(data_type);
GE_CHK_BOOL_EXEC(iter != kConstOpHcclDataTypeSize.end(), return PARAM_INVALID, GE_CHK_BOOL_EXEC(iter != kConstOpHcclDataTypeSize.end(), return PARAM_INVALID,
"HcomOmeUtil::HcomDataTypeSize , No DataTypeSize!");
"[Check][Param] param data_type:%d not find", data_type);


size = iter->second; size = iter->second;
return SUCCESS; return SUCCESS;
@@ -83,21 +82,22 @@ Status HcomOmeUtil::GetHcomCount(const ge::ConstOpDescPtr &op_desc, HcclDataType
int &count) { int &count) {
GE_CHECK_NOTNULL(op_desc); GE_CHECK_NOTNULL(op_desc);
if (!IsHCOMOp(op_desc->GetType())) { if (!IsHCOMOp(op_desc->GetType())) {
REPORT_INNER_ERROR("E19999", "Op:%s(%s) is not hcom op, check invalid",
op_desc->GetName().c_str(), op_desc->GetType().c_str());
GELOGE(PARAM_INVALID, "HcomOmeUtil:: operator is not Hcom operator.");
REPORT_INNER_ERROR("E19999", "Op:%s(%s) is not hcom op, check invalid", op_desc->GetName().c_str(),
op_desc->GetType().c_str());
GELOGE(PARAM_INVALID, "[Check][Param] Op:%s(%s) is not hcom op", op_desc->GetName().c_str(),
op_desc->GetType().c_str());
return PARAM_INVALID; return PARAM_INVALID;
} }
int64_t total_size = 0; int64_t total_size = 0;
int64_t align_size = 512; int64_t align_size = 512;
int32_t size = 0; int32_t size = 0;
GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclTypeSize(data_type, size), "GetHcomCount: GetHcclTypeSize fail!");
GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclTypeSize(data_type, size), "[Get][HcclTypeSize] fail, datatype:%d", data_type);
if (op_desc->GetType() == HCOMRECEIVE) { if (op_desc->GetType() == HCOMRECEIVE) {
for (size_t i = 0; i < op_desc->GetOutputsSize(); ++i) { for (size_t i = 0; i < op_desc->GetOutputsSize(); ++i) {
int64_t output_size = 0; int64_t output_size = 0;
GE_CHECK_NOTNULL(op_desc->GetOutputDescPtr(i)); GE_CHECK_NOTNULL(op_desc->GetOutputDescPtr(i));
GE_CHK_STATUS_RET(ge::TensorUtils::GetSize(*op_desc->GetOutputDescPtr(i), output_size), GE_CHK_STATUS_RET(ge::TensorUtils::GetSize(*op_desc->GetOutputDescPtr(i), output_size),
"Get size from TensorDesc failed, op: %s, output index: %zu.", op_desc->GetName().c_str(), i);
"[Get][Size] from TensorDesc failed, op:%s, output index:%zu.", op_desc->GetName().c_str(), i);
output_size = (output_size + align_size - 1) / align_size * align_size; output_size = (output_size + align_size - 1) / align_size * align_size;
total_size += output_size; total_size += output_size;
} }
@@ -107,42 +107,48 @@ Status HcomOmeUtil::GetHcomCount(const ge::ConstOpDescPtr &op_desc, HcclDataType
int64_t block_size = 0; int64_t block_size = 0;
GE_CHECK_NOTNULL(op_desc->GetInputDescPtr(i)); GE_CHECK_NOTNULL(op_desc->GetInputDescPtr(i));
GE_CHK_STATUS_RET(ge::TensorUtils::GetSize(*op_desc->GetInputDescPtr(i), input_size), GE_CHK_STATUS_RET(ge::TensorUtils::GetSize(*op_desc->GetInputDescPtr(i), input_size),
"get size from TensorDesc failed, op : %s, input index : %zu", op_desc->GetName().c_str(), i);
"[Get][Size] from TensorDesc failed, op:%s, input index:%zu", op_desc->GetName().c_str(), i);
// dynamic shape hccl op get size from output tensor desc // dynamic shape hccl op get size from output tensor desc
if (op_desc->HasAttr(ATTR_NAME_IS_UNKNOWN_SHAPE)) { if (op_desc->HasAttr(ATTR_NAME_IS_UNKNOWN_SHAPE)) {
GE_CHECK_NOTNULL(op_desc->GetOutputDescPtr(i)); GE_CHECK_NOTNULL(op_desc->GetOutputDescPtr(i));
GE_CHK_STATUS_RET(ge::TensorUtils::GetSize(*op_desc->GetOutputDescPtr(i), input_size), GE_CHK_STATUS_RET(ge::TensorUtils::GetSize(*op_desc->GetOutputDescPtr(i), input_size),
"get size from TensorDesc failed, op : %s, input index : %zu", op_desc->GetName().c_str(), i);
"[Get][Size] from TensorDesc failed, op:%s, input index:%zu", op_desc->GetName().c_str(), i);
} }


GE_IF_BOOL_EXEC( GE_IF_BOOL_EXEC(
op_desc->GetType() == HCOMREDUCESCATTER, int32_t rank_size = 0; op_desc->GetType() == HCOMREDUCESCATTER, int32_t rank_size = 0;
GE_CHK_BOOL_RET_STATUS(ge::AttrUtils::GetInt(op_desc, HCOM_ATTR_RANK_SIZE, rank_size), PARAM_INVALID, GE_CHK_BOOL_RET_STATUS(ge::AttrUtils::GetInt(op_desc, HCOM_ATTR_RANK_SIZE, rank_size), PARAM_INVALID,
"get HCOM_ATTR_RANK_SIZE failed");
GE_CHK_BOOL_RET_STATUS(rank_size != 0, PARAM_INVALID, "rank size is zero");
int64_t shape_size = op_desc->GetInputDescPtr(i)->GetShape().GetShapeSize(); GE_CHK_STATUS_RET(
ge::CheckInt64Uint32MulOverflow(shape_size, size), "Product of shape size and size beyond INT64_MAX");
"[Get][Attr] %s in op:%s(%s) failed", HCOM_ATTR_RANK_SIZE.c_str(),
op_desc->GetName().c_str(), op_desc->GetType().c_str());
GE_CHK_BOOL_RET_STATUS(rank_size != 0, PARAM_INVALID, "[Check][Param] rank size is zero");
int64_t shape_size = op_desc->GetInputDescPtr(i)->GetShape().GetShapeSize();
GE_CHK_STATUS_RET(ge::CheckInt64Uint32MulOverflow(shape_size, size),
"[Check][Param] Product of shape size:%ld and size:%d beyond INT64_MAX, op:%s(%s)",
shape_size, size, op_desc->GetName().c_str(), op_desc->GetType().c_str());
block_size = (shape_size * size) / rank_size; block_size = (shape_size * size) / rank_size;
GE_CHK_STATUS_RET(ge::CheckInt64AddOverflow(total_size, block_size), "Total size is beyond the INT64_MAX");
GE_CHK_STATUS_RET(ge::CheckInt64AddOverflow(total_size, block_size),
"[Check][Param] Total size:%ld is beyond the INT64_MAX, op:%s(%s)",
total_size, op_desc->GetName().c_str(), op_desc->GetType().c_str());
total_size = total_size + block_size; continue;); total_size = total_size + block_size; continue;);


int64_t shape_size = op_desc->GetInputDescPtr(i)->GetShape().GetShapeSize(); int64_t shape_size = op_desc->GetInputDescPtr(i)->GetShape().GetShapeSize();
GELOGD("hcom util node %s inputsize %ld, shapesize %ld, datasize %d.", GELOGD("hcom util node %s inputsize %ld, shapesize %ld, datasize %d.",
op_desc->GetName().c_str(), input_size, shape_size, size); op_desc->GetName().c_str(), input_size, shape_size, size);
GE_CHK_STATUS_RET(ge::CheckInt64Int32MulOverflow(shape_size, size), GE_CHK_STATUS_RET(ge::CheckInt64Int32MulOverflow(shape_size, size),
"Product of shape size and size beyond INT64_MAX");
"[Check][Param] Product of shape size:%ld and size:%d beyond INT64_MAX", shape_size, size);
GE_IF_BOOL_EXEC(is_allgather, block_size = shape_size * size;); GE_IF_BOOL_EXEC(is_allgather, block_size = shape_size * size;);
GE_IF_BOOL_EXEC(!is_allgather, block_size = (input_size + align_size - 1) / align_size * align_size;); GE_IF_BOOL_EXEC(!is_allgather, block_size = (input_size + align_size - 1) / align_size * align_size;);
GE_CHK_STATUS_RET(ge::CheckInt64AddOverflow(total_size, block_size), "Total size is beyond the INT64_MAX");
GE_CHK_STATUS_RET(ge::CheckInt64AddOverflow(total_size, block_size),
"[Check][Param] Total size:%ld is beyond the INT64_MAX", total_size);
total_size = total_size + block_size; total_size = total_size + block_size;
} }
} }


GE_CHK_BOOL_RET_STATUS(size != 0, PARAM_INVALID, "Size is zero");
GE_CHK_BOOL_RET_STATUS(size != 0, PARAM_INVALID, "[Check][Param] Size is zero");
count = static_cast<int>(total_size / size); count = static_cast<int>(total_size / size);


GE_CHK_BOOL_EXEC(total_size % size == 0, return PARAM_INVALID, "total_size:%ld is not divisiable by size:%d.",
total_size, size);
GE_CHK_BOOL_EXEC(total_size % size == 0, return PARAM_INVALID,
"[Check][Param] total_size:%ld is not divisiable by size:%d.", total_size, size);


return SUCCESS; return SUCCESS;
} }
@@ -153,32 +159,34 @@ Status HcomOmeUtil::GetHorovodCount(const ge::ConstOpDescPtr &op_desc,
if (!IsHorovodOp(op_desc->GetType())) { if (!IsHorovodOp(op_desc->GetType())) {
REPORT_INNER_ERROR("E19999", "Op:%s(%s) is not horovod op, check invalid", REPORT_INNER_ERROR("E19999", "Op:%s(%s) is not horovod op, check invalid",
op_desc->GetName().c_str(), op_desc->GetType().c_str()); op_desc->GetName().c_str(), op_desc->GetType().c_str());
GELOGE(PARAM_INVALID, "HcomOmeUtil:: operator is not Horovod operator.");
GELOGE(PARAM_INVALID, "[Call][IsHorovodOp] failed, Op:%s(%s) is not horovod op",
op_desc->GetName().c_str(), op_desc->GetType().c_str());
return PARAM_INVALID; return PARAM_INVALID;
} }
int64_t align_size = 512; int64_t align_size = 512;
int32_t size = 0; int32_t size = 0;
for (size_t i = 0; i < op_desc->GetInputsSize(); i++) { for (size_t i = 0; i < op_desc->GetInputsSize(); i++) {
GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclTypeSize(static_cast<HcclDataType>(kernel_hccl_infos[i].dataType), size), GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclTypeSize(static_cast<HcclDataType>(kernel_hccl_infos[i].dataType), size),
"GetHorovodCount: GetHcclTypeSize fail!");
"[Call][GetHcclTypeSize] fail, op:%s(%s)",
op_desc->GetName().c_str(), op_desc->GetType().c_str());
int64_t input_size = 0; int64_t input_size = 0;
int64_t block_size = 0; int64_t block_size = 0;
GE_CHECK_NOTNULL(op_desc->GetInputDescPtr(i)); GE_CHECK_NOTNULL(op_desc->GetInputDescPtr(i));
GE_CHK_STATUS_RET(ge::TensorUtils::GetSize(*op_desc->GetInputDescPtr(i), input_size), GE_CHK_STATUS_RET(ge::TensorUtils::GetSize(*op_desc->GetInputDescPtr(i), input_size),
"get size from TensorDesc failed, op : %s, input index : %zu", op_desc->GetName().c_str(), i);
"[Get][Size] from TensorDesc failed, op:%s, input index:%zu", op_desc->GetName().c_str(), i);


int64_t shape_size = op_desc->GetInputDescPtr(i)->GetShape().GetShapeSize(); int64_t shape_size = op_desc->GetInputDescPtr(i)->GetShape().GetShapeSize();
GE_CHK_STATUS_RET(ge::CheckInt64Int32MulOverflow(shape_size, size), GE_CHK_STATUS_RET(ge::CheckInt64Int32MulOverflow(shape_size, size),
"Product of shape size and size beyond INT64_MAX");
"[Check][Param] Product of shape size:%ld and size:%d beyond INT64_MAX", shape_size, size);
if (kernel_hccl_infos[0].hccl_type == HVDCALLBACKALLGATHER) { if (kernel_hccl_infos[0].hccl_type == HVDCALLBACKALLGATHER) {
block_size = shape_size * size; block_size = shape_size * size;
} else { } else {
block_size = (input_size + align_size - 1) / align_size * align_size; block_size = (input_size + align_size - 1) / align_size * align_size;
} }


GE_CHK_BOOL_RET_STATUS(size != 0, PARAM_INVALID, "Size is zero");
GE_CHK_BOOL_EXEC(block_size % size == 0, return PARAM_INVALID, "block_size:%ld is not divisiable by size:%d.",
block_size, size);
GE_CHK_BOOL_RET_STATUS(size != 0, PARAM_INVALID, "[Check][Param] Size is zero");
GE_CHK_BOOL_EXEC(block_size % size == 0, return PARAM_INVALID,
"[Check][Param] block_size:%ld is not divisiable by size:%d.", block_size, size);
kernel_hccl_infos[i].count = static_cast<int>(block_size / size); kernel_hccl_infos[i].count = static_cast<int>(block_size / size);
} }


@@ -191,7 +199,8 @@ Status HcomOmeUtil::GetHcclCount(const ge::ConstOpDescPtr &op_desc,
Status ret; Status ret;
ret = CheckKernelHcclInfo(op_desc, kernel_hccl_infos); ret = CheckKernelHcclInfo(op_desc, kernel_hccl_infos);
if (ret != SUCCESS) { if (ret != SUCCESS) {
GELOGE(PARAM_INVALID, "HcomOmeUtil:: the number of GETaskKernelHcclInfo is invalid.");
GELOGE(PARAM_INVALID, "[Check][KernelHcclInfo] failed, the number of GETaskKernelHcclInfo is invalid, op:%s(%s).",
op_desc->GetName().c_str(), op_desc->GetType().c_str());
return PARAM_INVALID; return PARAM_INVALID;
} }
GELOGI("GetHcclCount start, node[%s], opType[%s].", op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGI("GetHcclCount start, node[%s], opType[%s].", op_desc->GetName().c_str(), op_desc->GetType().c_str());
@@ -200,7 +209,7 @@ Status HcomOmeUtil::GetHcclCount(const ge::ConstOpDescPtr &op_desc,
ret = GetHcomCount(op_desc, static_cast<HcclDataType>(kernel_hccl_infos[0].dataType), ret = GetHcomCount(op_desc, static_cast<HcclDataType>(kernel_hccl_infos[0].dataType),
kernel_hccl_infos[0].hccl_type == HCOMALLGATHER, count); kernel_hccl_infos[0].hccl_type == HCOMALLGATHER, count);
if (ret != SUCCESS) { if (ret != SUCCESS) {
GELOGE(ret, "HcomOmeUtil:: Node: %s Optype: %s get the Hcom operator hccl count fail.",
GELOGE(ret, "[Call][GetHcomCount] Node:%s Optype:%s get the Hcom operator hccl count fail.",
op_desc->GetName().c_str(), op_desc->GetType().c_str()); op_desc->GetName().c_str(), op_desc->GetType().c_str());
return PARAM_INVALID; return PARAM_INVALID;
} }
@@ -210,7 +219,7 @@ Status HcomOmeUtil::GetHcclCount(const ge::ConstOpDescPtr &op_desc,
if (IsHorovodOp(op_desc->GetType())) { if (IsHorovodOp(op_desc->GetType())) {
ret = GetHorovodCount(op_desc, kernel_hccl_infos); ret = GetHorovodCount(op_desc, kernel_hccl_infos);
if (ret != SUCCESS) { if (ret != SUCCESS) {
GELOGE(PARAM_INVALID, "HcomOmeUtil:: Node: %s Optype: %s get the Horovod hccl operator count fail.",
GELOGE(PARAM_INVALID, "[Call][GetHorovodCount] Node:%s Optype:%s get the Horovod hccl operator count fail.",
op_desc->GetName().c_str(), op_desc->GetType().c_str()); op_desc->GetName().c_str(), op_desc->GetType().c_str());
return PARAM_INVALID; return PARAM_INVALID;
} }
@@ -225,11 +234,10 @@ Status HcomOmeUtil::GetHcclOperationType(const ge::ConstOpDescPtr &op_desc, Hccl
if (IsHCOMOp(op_desc->GetType())) { if (IsHCOMOp(op_desc->GetType())) {
std::string hcom_op_type; std::string hcom_op_type;
GE_CHK_BOOL_EXEC(ge::AttrUtils::GetStr(op_desc, HCOM_ATTR_REDUCE_TYPE, hcom_op_type), GE_CHK_BOOL_EXEC(ge::AttrUtils::GetStr(op_desc, HCOM_ATTR_REDUCE_TYPE, hcom_op_type),
REPORT_INNER_ERROR("E19999", "Get Attr:%s in op:%s(%s) fail",
HCOM_ATTR_REDUCE_TYPE.c_str(),
REPORT_INNER_ERROR("E19999", "Get Attr:%s in op:%s(%s) fail", HCOM_ATTR_REDUCE_TYPE.c_str(),
op_desc->GetName().c_str(), op_desc->GetType().c_str()); op_desc->GetName().c_str(), op_desc->GetType().c_str());
return PARAM_INVALID, return PARAM_INVALID,
"HcomOmeUtil:: Node: %s Optype: %s Get HCOM_ATTR_REDUCE_TYPE fail, not support!",
"[Get][Attr] %s in op:%s(%s) fail", HCOM_ATTR_REDUCE_TYPE.c_str(),
op_desc->GetName().c_str(), op_desc->GetType().c_str()); op_desc->GetName().c_str(), op_desc->GetType().c_str());


if (hcom_op_type == "min") { if (hcom_op_type == "min") {
@@ -244,7 +252,9 @@ Status HcomOmeUtil::GetHcclOperationType(const ge::ConstOpDescPtr &op_desc, Hccl
REPORT_INNER_ERROR("E19999", "Attr:%s in Op:%s(%s), hcom_op_type value:%s is not support now, " REPORT_INNER_ERROR("E19999", "Attr:%s in Op:%s(%s), hcom_op_type value:%s is not support now, "
"check invalid", HCOM_ATTR_REDUCE_TYPE.c_str(), "check invalid", HCOM_ATTR_REDUCE_TYPE.c_str(),
op_desc->GetName().c_str(), op_desc->GetType().c_str(), hcom_op_type.c_str()); op_desc->GetName().c_str(), op_desc->GetType().c_str(), hcom_op_type.c_str());
GELOGE(PARAM_INVALID, "HcomOmeUtil::Get HCOM_ATTR_REDUCE_TYPE fail, [%s] not support!", hcom_op_type.c_str());
GELOGE(PARAM_INVALID, "[Check][Param] Attr:%s in Op:%s(%s), hcom_op_type value:%s is not support now",
HCOM_ATTR_REDUCE_TYPE.c_str(), op_desc->GetName().c_str(),
op_desc->GetType().c_str(), hcom_op_type.c_str());
return PARAM_INVALID; return PARAM_INVALID;
} }
} }
@@ -256,7 +266,7 @@ Status HcomOmeUtil::GetHcclOperationType(const ge::ConstOpDescPtr &op_desc, Hccl
ATTR_HOROVOD_ATTR_REDUCE_TYPE.c_str(), ATTR_HOROVOD_ATTR_REDUCE_TYPE.c_str(),
op_desc->GetName().c_str(), op_desc->GetType().c_str()); op_desc->GetName().c_str(), op_desc->GetType().c_str());
return PARAM_INVALID, return PARAM_INVALID,
"HcomOmeUtil:: Node: %s Optype: %s Get ATTR_HOROVOD_ATTR_REDUCE_TYPE fail, not support!",
"[Get][Attr] %s in op:%s(%s) fail", ATTR_HOROVOD_ATTR_REDUCE_TYPE.c_str(),
op_desc->GetName().c_str(), op_desc->GetType().c_str()); op_desc->GetName().c_str(), op_desc->GetType().c_str());


auto iter = kHorovodRedOpToHcclRedOp.find(static_cast<HorovodReduceOp>(horovod_op_type)); auto iter = kHorovodRedOpToHcclRedOp.find(static_cast<HorovodReduceOp>(horovod_op_type));
@@ -264,8 +274,8 @@ Status HcomOmeUtil::GetHcclOperationType(const ge::ConstOpDescPtr &op_desc, Hccl
REPORT_INNER_ERROR("E19999", "Attr:%s in Op:%s(%s), horovod_op_type value:%ld is not support now, " REPORT_INNER_ERROR("E19999", "Attr:%s in Op:%s(%s), horovod_op_type value:%ld is not support now, "
"check invalid", ATTR_HOROVOD_ATTR_REDUCE_TYPE.c_str(), "check invalid", ATTR_HOROVOD_ATTR_REDUCE_TYPE.c_str(),
op_desc->GetName().c_str(), op_desc->GetType().c_str(), horovod_op_type); op_desc->GetName().c_str(), op_desc->GetType().c_str(), horovod_op_type);
GELOGE(PARAM_INVALID, "HcomOmeUtil:: Node: %s Optype: %s HcomOpType cann't support! Current HcomOpType : %ld",
op_desc->GetName().c_str(), op_desc->GetType().c_str(), horovod_op_type);
GELOGE(PARAM_INVALID, "[Check][Param] Attr:%s in Op:%s(%s), horovod_op_type value:%ld is not support now",
ATTR_HOROVOD_ATTR_REDUCE_TYPE.c_str(), op_desc->GetName().c_str(), op_desc->GetType().c_str(), horovod_op_type);
return PARAM_INVALID; return PARAM_INVALID;
} }
op_type = iter->second; op_type = iter->second;
@@ -281,7 +291,7 @@ Status HcomOmeUtil::GetHcclRootId(const ge::ConstOpDescPtr &op_desc, int64_t &ro
HCOM_ATTR_ROOT_RANK.c_str(), HCOM_ATTR_ROOT_RANK.c_str(),
op_desc->GetName().c_str(), op_desc->GetType().c_str()); op_desc->GetName().c_str(), op_desc->GetType().c_str());
return PARAM_INVALID, return PARAM_INVALID,
"HcomOmeUtil::Node %s Optype: %s Get HCOM_ATTR_ROOT_INDEX fail, not support!",
"[Get][Attr] %s in op:%s(%s) fail", HCOM_ATTR_ROOT_RANK.c_str(),
op_desc->GetName().c_str(), op_desc->GetType().c_str()); op_desc->GetName().c_str(), op_desc->GetType().c_str());


return SUCCESS; return SUCCESS;
@@ -296,7 +306,7 @@ Status HcomOmeUtil::GetAllRootId(const ge::ConstOpDescPtr &op_desc,
int64_t root_id = 0; int64_t root_id = 0;
Status dmrt = GetHcclRootId(op_desc, root_id); Status dmrt = GetHcclRootId(op_desc, root_id);
if (dmrt != SUCCESS) { if (dmrt != SUCCESS) {
GELOGE(FAILED, "davinci_model: GetHcomRootId fail! domi error: %u", dmrt);
GELOGE(FAILED, "[Get][HcclRootId] fail! domi error: %u", dmrt);
return FAILED; return FAILED;
} }


@@ -324,7 +334,8 @@ Status HcomOmeUtil::CheckKernelHcclInfo(const ge::ConstOpDescPtr &op_desc,
REPORT_INNER_ERROR("E19999", "Op:%s(%s) is not hcom op or param kernel_hccl_infos.size:%zu != 1, " REPORT_INNER_ERROR("E19999", "Op:%s(%s) is not hcom op or param kernel_hccl_infos.size:%zu != 1, "
"check invalid", "check invalid",
op_desc->GetName().c_str(), op_desc->GetType().c_str(), kernel_hccl_infos.size()); op_desc->GetName().c_str(), op_desc->GetType().c_str(), kernel_hccl_infos.size());
GELOGE(PARAM_INVALID, "HcomOmeUtil:: in Hcom scenario, the number of GETaskKernelHcclInfo is invalid.");
GELOGE(PARAM_INVALID, "[Check][Param] Op:%s(%s) is not hcom op or param kernel_hccl_infos.size:%zu != 1",
op_desc->GetName().c_str(), op_desc->GetType().c_str(), kernel_hccl_infos.size());
return PARAM_INVALID; return PARAM_INVALID;
} }


@@ -337,7 +348,9 @@ Status HcomOmeUtil::CheckKernelHcclInfo(const ge::ConstOpDescPtr &op_desc,
"in op:%s(%s), check invalid", "in op:%s(%s), check invalid",
kernel_hccl_infos.size(), op_desc->GetInputsSize(), kernel_hccl_infos.size(), op_desc->GetInputsSize(),
op_desc->GetName().c_str(), op_desc->GetType().c_str()); op_desc->GetName().c_str(), op_desc->GetType().c_str());
GELOGE(PARAM_INVALID, "HcomOmeUtil:: in Horovod scenario, the number of GETaskKernelHcclInfo is invalid.");
GELOGE(PARAM_INVALID, "Param kernel_hccl_infos.size:%zu is empty or not equal to "
"input_desc size:%zu in op:%s(%s)", kernel_hccl_infos.size(), op_desc->GetInputsSize(),
op_desc->GetName().c_str(), op_desc->GetType().c_str());
return PARAM_INVALID; return PARAM_INVALID;
} }
} }
@@ -360,7 +373,7 @@ Status HcomOmeUtil::GetHorovodInputs(const ge::ConstOpDescPtr &op_desc,
} }


if (CheckKernelHcclInfo(op_desc, kernel_hccl_infos) != SUCCESS) { if (CheckKernelHcclInfo(op_desc, kernel_hccl_infos) != SUCCESS) {
GELOGE(PARAM_INVALID, "HcomOmeUtil:: Node: %s Optype: %s the number of GETaskKernelHcclInfo is invalid.",
GELOGE(PARAM_INVALID, "[Check][KernelHcclInfo] Node:%s Optype:%s the number of GETaskKernelHcclInfo is invalid.",
op_desc->GetName().c_str(), op_desc->GetType().c_str()); op_desc->GetName().c_str(), op_desc->GetType().c_str());
return PARAM_INVALID; return PARAM_INVALID;
} }


+ 1
- 1
ge/graph/manager/util/variable_accelerate_ctrl.cc View File

@@ -54,7 +54,7 @@ void VarAccelerateCtrl::SetVarChanged(const std::string &var_name) {
void VarAccelerateCtrl::AddGraph(uint32_t graph_id, const ComputeGraphPtr &compute_graph) { void VarAccelerateCtrl::AddGraph(uint32_t graph_id, const ComputeGraphPtr &compute_graph) {
std::lock_guard<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
if (compute_graph == nullptr) { if (compute_graph == nullptr) {
GELOGE(PARAM_INVALID, "Failed to add graph %u, the compute graph is null", graph_id);
GELOGE(PARAM_INVALID, "[Check][Param] Failed to add graph %u, the compute graph is null", graph_id);
return; return;
} }
auto &var_names = graph_ids_to_var_names_[graph_id]; auto &var_names = graph_ids_to_var_names_[graph_id];


+ 1
- 2
ge/ir_build/option_utils.cc View File

@@ -253,8 +253,7 @@ bool CheckDynamicImagesizeInputShapeValid(map<string, vector<int64_t>> shape_map
for (auto str : split_set) { for (auto str : split_set) {
split_dim = StringUtils::Split(str, ','); split_dim = StringUtils::Split(str, ',');
if (split_dim.size() != static_cast<size_t>(kDynamicImageSizeNum)) { if (split_dim.size() != static_cast<size_t>(kDynamicImageSizeNum)) {
ErrorManager::GetInstance().ATCReportErrMessage("E10020", {"DynamicImageSizeNum"},
{std::to_string(kDynamicImageSizeNum)});
ErrorManager::GetInstance().ATCReportErrMessage("E10020");
GELOGE(ge::PARAM_INVALID, GELOGE(ge::PARAM_INVALID,
"[Check][DynamicImagesizeInputShape] invalid value:%s number of dimensions of each group must be %ld.", "[Check][DynamicImagesizeInputShape] invalid value:%s number of dimensions of each group must be %ld.",
dynamic_image_size.c_str(), kDynamicImageSizeNum); dynamic_image_size.c_str(), kDynamicImageSizeNum);


Loading…
Cancel
Save