diff --git a/ge/common/model_parser/model_parser.cc b/ge/common/model_parser/model_parser.cc index ce654887..5c68eea8 100644 --- a/ge/common/model_parser/model_parser.cc +++ b/ge/common/model_parser/model_parser.cc @@ -62,7 +62,6 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelParserBase::LoadFro char *data = new (std::nothrow) char[len]; if (data == nullptr) { - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Load model From file failed, bad memory allocation occur. (need:%u)", len); GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Load][ModelFromFile]Failed, " "bad memory allocation occur(need %u), file %s", len, model_path); REPORT_CALL_ERROR("E19999", "Load model from file %s failed, " @@ -90,33 +89,45 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelParserBase::ParseMo GE_CHECK_NOTNULL(model.model_data); // Model length too small - GE_CHK_BOOL_RET_STATUS(model.model_len >= sizeof(ModelFileHeader), ACL_ERROR_GE_EXEC_MODEL_DATA_SIZE_INVALID, - "Invalid model. Model data size %u must be greater than or equal to %zu.", model.model_len, - sizeof(ModelFileHeader)); + GE_CHK_BOOL_EXEC(model.model_len >= sizeof(ModelFileHeader), + REPORT_INPUT_ERROR("E10003", std::vector({"parameter", "value", "reason"}), + std::vector({"om", model.om_name.c_str(), "invalid om file"})); + GELOGE(ACL_ERROR_GE_EXEC_MODEL_DATA_SIZE_INVALID, + "[Check][Param] Invalid model. Model data size %u must be greater than or equal to %zu.", + model.model_len, sizeof(ModelFileHeader)); + return ACL_ERROR_GE_EXEC_MODEL_DATA_SIZE_INVALID;); // Get file header auto file_header = reinterpret_cast(model.model_data); // Determine whether the file length and magic number match - GE_CHK_BOOL_RET_STATUS( - file_header->length == model.model_len - sizeof(ModelFileHeader) && file_header->magic == MODEL_FILE_MAGIC_NUM, - ACL_ERROR_GE_EXEC_MODEL_DATA_SIZE_INVALID, - "Invalid model. file_header->length[%u] + sizeof(ModelFileHeader)[%zu] != model->model_len[%u] || " - "MODEL_FILE_MAGIC_NUM[%u] != file_header->magic[%u]", - file_header->length, sizeof(ModelFileHeader), model.model_len, MODEL_FILE_MAGIC_NUM, file_header->magic); - + GE_CHK_BOOL_EXEC(file_header->length == model.model_len - sizeof(ModelFileHeader) && + file_header->magic == MODEL_FILE_MAGIC_NUM, + REPORT_INPUT_ERROR("E10003", std::vector({"parameter", "value", "reason"}), + std::vector({"om", model.om_name.c_str(), "invalid om file"})); + GELOGE(ACL_ERROR_GE_EXEC_MODEL_DATA_SIZE_INVALID, + "[Check][Param] Invalid model, file_header->length[%u] + sizeof(ModelFileHeader)[%zu] != " + "model->model_len[%u] || MODEL_FILE_MAGIC_NUM[%u] != file_header->magic[%u]", + file_header->length, sizeof(ModelFileHeader), model.model_len, + MODEL_FILE_MAGIC_NUM, file_header->magic); + return ACL_ERROR_GE_EXEC_MODEL_DATA_SIZE_INVALID;); Status res = SUCCESS; // Get data address uint8_t *data = reinterpret_cast(model.model_data) + sizeof(ModelFileHeader); if (file_header->is_encrypt == ModelEncryptType::UNENCRYPTED) { // Unencrypted model - GE_CHK_BOOL_RET_STATUS(model.key.empty(), ACL_ERROR_GE_PARAM_INVALID, - "Invalid param. model is unencrypted, but key is not empty."); - + if (!model.key.empty()) { + REPORT_INPUT_ERROR("E10003", std::vector({"parameter", "value", "reason"}), + std::vector({"om", model.om_name.c_str(), "invalid om file"})); + GELOGE(ACL_ERROR_GE_PARAM_INVALID, + "[Check][Param] Invalid param, model is unencrypted, but key is not empty."); + return ACL_ERROR_GE_PARAM_INVALID; + } model_data = data; model_len = file_header->length; GELOGD("Model_len is %u, model_file_head_len is %zu.", model_len, sizeof(ModelFileHeader)); } else { GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Check][Param]Invalid, model encrypt type not supported"); - REPORT_CALL_ERROR("E19999","Invalid model, encrypt type not supported"); + REPORT_INPUT_ERROR("E10003", std::vector({"parameter", "value", "reason"}), + std::vector({"om", model.om_name.c_str(), "invalid om file"})); res = ACL_ERROR_GE_PARAM_INVALID; } diff --git a/ge/graph/load/graph_loader.cc b/ge/graph/load/graph_loader.cc index e4904614..7ee74d1d 100755 --- a/ge/graph/load/graph_loader.cc +++ b/ge/graph/load/graph_loader.cc @@ -33,12 +33,12 @@ Status GraphLoader::UnloadModel(uint32_t model_id) { Status ret = model_manager->Stop(model_id); if (ret != SUCCESS) { - GELOGE(ret, "UnloadModel: Stop failed. model id:%u", model_id); + GELOGE(ret, "[Stop][Model] failed. model id:%u", model_id); } ret = model_manager->Unload(model_id); if (ret != SUCCESS) { - GELOGE(ret, "UnloadModel: Unload failed. model id:%u", model_id); + GELOGE(ret, "[Unload][Model] failed. model id:%u", model_id); return ret; } GELOGI("UnLoad model success, model id:%u.", model_id); @@ -50,14 +50,13 @@ Status GraphLoader::LoadModelOnline(uint32_t &model_id, const std::shared_ptrLoadModelOnline(model_id, ge_root_model_ptr, listener); if (ret != SUCCESS) { - GELOGE(ret, "LoadModel: Load failed. ret = %u", ret); + GELOGE(ret, "[Load][Model] Online failed. ret = %u, model_id:%u", ret, model_id); rt_ret = rtDeviceReset(GetContext().DeviceId()); if (rt_ret != RT_ERROR_NONE) { REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, ret:0x%X", GetContext().DeviceId(), rt_ret); - GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); + GELOGE(RT_FAILED, "[Call][RtDeviceReset] failed, device_id:%u, ret:0x%X", GetContext().DeviceId(), rt_ret); } return ret; } @@ -81,31 +80,31 @@ Status GraphLoader::LoadModelOnline(uint32_t &model_id, const std::shared_ptrStart(model_id); if (ret != SUCCESS) { if (model_manager->Unload(model_id) != SUCCESS) { - GELOGE(ret, "LoadModel: Unload failed while trying to unload after a failed start."); + GELOGE(ret, "[Unload][Model] failed while trying to unload after a failed start, model_id:%u.", model_id); } rt_ret = rtDeviceReset(GetContext().DeviceId()); if (rt_ret != RT_ERROR_NONE) { REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, ret:0x%X", GetContext().DeviceId(), rt_ret); - GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); + GELOGE(RT_FAILED, "[Call][RtDeviceReset] failed, device_id:%u, ret:0x%X", GetContext().DeviceId(), rt_ret); } - GELOGE(ret, "LoadModel: Start failed."); + GELOGE(ret, "[Start][Model] failed, model_id:%u.", model_id); return ret; } rt_ret = rtDeviceReset(GetContext().DeviceId()); if (rt_ret != RT_ERROR_NONE) { REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, ret:0x%X", GetContext().DeviceId(), rt_ret); - GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); + GELOGE(RT_FAILED, "[Call][RtDeviceReset] failed, device_id:%u, ret:0x%X", GetContext().DeviceId(), rt_ret); return RT_FAILED; } GELOGI("Load model online success, model_id:%u.", model_id); @@ -118,7 +117,7 @@ Status GraphLoader::GetMaxUsedMemory(uint32_t model_id, uint64_t &max_size) { GE_CHECK_NOTNULL(model_manager); Status ret = model_manager->GetMaxUsedMemory(model_id, max_size); if (ret != SUCCESS) { - GELOGE(ret, "GetMaxUsedMemory: GetMaxUsedMemory failed."); + GELOGE(ret, "[Call][GetMaxUsedMemory] failed, model_id:%u.", model_id); return ret; } return SUCCESS; @@ -127,21 +126,20 @@ Status GraphLoader::GetMaxUsedMemory(uint32_t model_id, uint64_t &max_size) { Status GraphLoader::LoadDataFromFile(const std::string &path, const std::string &key_path, int32_t priority, ModelData &model_data) { if (!CheckInputPathValid(path)) { - GELOGE(ACL_ERROR_GE_EXEC_MODEL_PATH_INVALID, "model path is invalid: %s", path.c_str()); + GELOGE(ACL_ERROR_GE_EXEC_MODEL_PATH_INVALID, "[Check][Param] model path is invalid:%s", path.c_str()); return ACL_ERROR_GE_EXEC_MODEL_PATH_INVALID; } GELOGI("Load model begin, model path is: %s", path.c_str()); if (!key_path.empty() && !CheckInputPathValid(key_path)) { - REPORT_INNER_ERROR("E19999", "Param key_path:%s empty or invalid", - key_path.c_str()); - GELOGE(ACL_ERROR_GE_PARAM_INVALID, "decrypt_key path is invalid: %s", key_path.c_str()); + REPORT_INNER_ERROR("E19999", "Param key_path:%s empty or invalid", key_path.c_str()); + GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Check][Param] decrypt_key path is invalid:%s", key_path.c_str()); return ACL_ERROR_GE_PARAM_INVALID; } Status ret = ModelParserBase::LoadFromFile(path.c_str(), key_path.c_str(), priority, model_data); if (ret != SUCCESS) { - GELOGE(ret, "LoadModelFromFile: Load failed. ret = %u", ret); + GELOGE(ret, "[Call][LoadFromFile] failed. ret = %u, path:%s, key path:%s", ret, path.c_str(), key_path.c_str()); if (model_data.model_data != nullptr) { delete[] static_cast(model_data.model_data); model_data.model_data = nullptr; @@ -156,18 +154,19 @@ Status GraphLoader::CommandHandle(const Command &command) { GE_CHECK_NOTNULL(model_manager); Status ret = model_manager->HandleCommand(command); if (ret != SUCCESS) { - GELOGE(ret, "CommandHandle: Command Handle failed."); + GELOGE(ret, "[Handle][Command] failed, module_index:%lu.", command.module_index); return ret; } } catch (std::bad_alloc &) { REPORT_INNER_ERROR("E19999", "Bad memory allocation occur"); - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Command handle failed, bad memory allocation occur !"); + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Handle][Command] failed, " + "bad memory allocation occur, module_index:%lu.", command.module_index); return ACL_ERROR_GE_MEMORY_ALLOCATION; } catch (...) { REPORT_INNER_ERROR("E19999", "Some exceptions occur"); - GELOGE(FAILED, "Command handle failed, some exceptions occur !"); + GELOGE(FAILED, "[Handle][Command] failed, some exceptions occur, module_index:%lu.", command.module_index); return FAILED; } @@ -184,7 +183,7 @@ Status GraphLoader::LoadModelFromData(uint32_t &model_id, const ModelData &model Status ret = model_manager->LoadModelOffline( model_id, model_data, nullptr, dev_ptr, mem_size, weight_ptr, weight_size); if (ret != SUCCESS) { - GELOGE(ret, "Load model failed, model_id:%u.", model_id); + GELOGE(ret, "[Load][Model] failed, model_id:%u.", model_id); return ret; } GELOGI("Load model success, model_id:%u.", model_id); @@ -210,7 +209,7 @@ Status GraphLoader::LoadModelWithQ(uint32_t &model_id, const ModelData &model_da GE_CHECK_NOTNULL(model_manager); Status ret = model_manager->LoadModelWithQ(model_id, model_data, input_queue_ids, output_queue_ids); if (ret != SUCCESS) { - GELOGE(ret, "Load model with queue failed, model_id:%u.", model_id); + GELOGE(ret, "[Load][Model] with queue failed, model_id:%u.", model_id); return ret; } @@ -237,7 +236,7 @@ Status GraphLoader::ExecuteModel(uint32_t model_id, rtStream_t stream, bool asyn Status ret = model_manager->ExecuteModel(model_id, stream, async_mode, input_data, input_desc, output_data, output_desc); if (ret != SUCCESS) { - GELOGE(ret, "Execute model failed, model_id:%u.", model_id); + GELOGE(ret, "[Execute][Model] failed, model_id:%u.", model_id); return ret; } @@ -250,7 +249,7 @@ Status GraphLoader::GetMemoryInfo(int64_t &free) { if (rt_ret != RT_ERROR_NONE) { REPORT_CALL_ERROR("E19999", "Call rtSetDevice failed, device_id:%u, ret:0x%X", GetContext().DeviceId(), rt_ret); - GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); + GELOGE(RT_FAILED, "[Call][RtSetDevice] failed, device_id:%u, ret:0x%X", GetContext().DeviceId(), rt_ret); return RT_FAILED; } size_t total_mem = 0; @@ -258,14 +257,14 @@ Status GraphLoader::GetMemoryInfo(int64_t &free) { rt_ret = rtMemGetInfo(&free_mem, &total_mem); if (rt_ret != RT_ERROR_NONE) { REPORT_CALL_ERROR("E19999", "Call rtMemGetInfo failed, ret:0x%X", rt_ret); - GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); + GELOGE(RT_FAILED, "[Call][RtMemGetInfo] failed, ret:0x%X", rt_ret); return RT_FAILED; } rt_ret = rtDeviceReset(GetContext().DeviceId()); if (rt_ret != RT_ERROR_NONE) { REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, ret:0x%X", GetContext().DeviceId(), rt_ret); - GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); + GELOGE(RT_FAILED, "[Call][RtDeviceReset] failed, device_id:%u, ret:0x%X", GetContext().DeviceId(), rt_ret); return RT_FAILED; } // Add small page memory size @@ -280,7 +279,8 @@ Status GraphLoader::DestroyAicpuKernel(uint64_t session_id, uint32_t model_id, u GE_CHECK_NOTNULL(model_manager); Status ret = model_manager->DestroyAicpuKernel(session_id, model_id, sub_model_id); if (ret != SUCCESS) { - GELOGE(ret, "Destroy aicpu kernel failed."); + GELOGE(ret, "[Destroy][AicpuKernel] failed, session_id:%lu, model_id:%u, sub_model_id:%u.", + session_id, model_id, sub_model_id); return ret; } return SUCCESS; @@ -291,7 +291,7 @@ Status GraphLoader::DestroyAicpuSessionForInfer(uint32_t model_id) { GE_CHECK_NOTNULL(model_manager); Status ret = model_manager->DestroyAicpuSessionForInfer(model_id); if (ret != SUCCESS) { - GELOGE(ret, "Destroy aicpu serrion for infer failed."); + GELOGE(ret, "[Call][DestroyAicpuSessionForInfer] failed, model_id:%u.", model_id); return ret; } return SUCCESS; diff --git a/ge/graph/load/model_manager/model_manager.cc b/ge/graph/load/model_manager/model_manager.cc index 47d104f4..770fdb08 100755 --- a/ge/graph/load/model_manager/model_manager.cc +++ b/ge/graph/load/model_manager/model_manager.cc @@ -102,16 +102,14 @@ Status ModelManager::KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType op_type, u auto kernel_size = sizeof(uint64_t) * (v_aicpu_kernel.size()); rtError_t rt_ret = rtMalloc(&aicpu_kernel_addr, kernel_size, RT_MEMORY_HBM); GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, - REPORT_CALL_ERROR("E19999", "Call rtMalloc failed, size:%zu, ret: 0x%X", - kernel_size, rt_ret); - GELOGE(RT_FAILED, "rtMalloc error, ret: 0x%X", rt_ret); + REPORT_CALL_ERROR("E19999", "Call rtMalloc failed, size:%zu, ret:0x%X", kernel_size, rt_ret); + GELOGE(RT_FAILED, "[Call][RtMalloc] failed, size:%zu, ret:0x%X", kernel_size, rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret);) rt_ret = rtMemcpy(aicpu_kernel_addr, kernel_size, v_aicpu_kernel.data(), kernel_size, RT_MEMCPY_HOST_TO_DEVICE); GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, - REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%zu, ret: 0x%X", - kernel_size, rt_ret); - GELOGE(RT_FAILED, "rtMemcpy to input_output_addr_ error: 0x%X", rt_ret); + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%zu, ret:0x%X", kernel_size, rt_ret); + GELOGE(RT_FAILED, "[Call][RtMemcpy] failed, size:%zu, ret:0x%X", kernel_size, rt_ret); GE_CHK_RT(rtFree(aicpu_kernel_addr)); return RT_ERROR_TO_GE_STATUS(rt_ret);) uint64_t kernel_id_addr = static_cast(reinterpret_cast(aicpu_kernel_addr)); param_base.fwkKernelBase.fwk_kernel.kernelID = kernel_id_addr; @@ -122,9 +120,8 @@ Status ModelManager::KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType op_type, u rtError_t rt_ret = rtMalloc(&(devicebase), sizeof(STR_FWK_OP_KERNEL), RT_MEMORY_HBM); if (rt_ret != RT_ERROR_NONE) { - REPORT_CALL_ERROR("E19999", "Call rtMalloc failed, size:%zu, ret: 0x%X", - sizeof(STR_FWK_OP_KERNEL), rt_ret); - GELOGE(RT_FAILED, "malloc device memory failed. ret: 0x%X", rt_ret); + REPORT_CALL_ERROR("E19999", "Call rtMalloc failed, size:%zu, ret:0x%X", sizeof(STR_FWK_OP_KERNEL), rt_ret); + GELOGE(RT_FAILED, "[Call][RtMalloc] failed. size:%zu, ret:0x%X", sizeof(STR_FWK_OP_KERNEL), rt_ret); GE_IF_BOOL_EXEC(aicpu_kernel_addr != nullptr, GE_CHK_RT(rtFree(aicpu_kernel_addr))); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -132,9 +129,8 @@ Status ModelManager::KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType op_type, u rt_ret = rtMemcpy(devicebase, sizeof(STR_FWK_OP_KERNEL), ¶m_base, sizeof(STR_FWK_OP_KERNEL), RT_MEMCPY_HOST_TO_DEVICE); if (rt_ret != RT_ERROR_NONE) { - REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%zu, ret: 0x%X", - sizeof(STR_FWK_OP_KERNEL), rt_ret); - GELOGE(RT_FAILED, "memory copy to device failed. ret: 0x%X", rt_ret); + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%zu, ret:0x%X", sizeof(STR_FWK_OP_KERNEL), rt_ret); + GELOGE(RT_FAILED, "[Call][RtMemcpy] failed, size:%zu, ret:0x%X", sizeof(STR_FWK_OP_KERNEL), rt_ret); GE_IF_BOOL_EXEC(aicpu_kernel_addr != nullptr, GE_CHK_RT(rtFree(aicpu_kernel_addr))); GE_CHK_RT(rtFree(devicebase)); return RT_ERROR_TO_GE_STATUS(rt_ret); @@ -143,8 +139,8 @@ Status ModelManager::KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType op_type, u rtStream_t stream = nullptr; rt_ret = rtStreamCreate(&stream, 0); if (rt_ret != RT_ERROR_NONE) { - REPORT_CALL_ERROR("E19999", "Call rtStreamCreate failed, ret: 0x%X", rt_ret); - GELOGE(RT_FAILED, "create stream failed. ret: 0x%X", rt_ret); + REPORT_CALL_ERROR("E19999", "Call rtStreamCreate failed, ret:0x%X", rt_ret); + GELOGE(RT_FAILED, "[Create][Stream] failed. ret:0x%X", rt_ret); GE_IF_BOOL_EXEC(aicpu_kernel_addr != nullptr, GE_CHK_RT(rtFree(aicpu_kernel_addr))); GE_CHK_RT(rtFree(devicebase)); return RT_ERROR_TO_GE_STATUS(rt_ret); @@ -152,8 +148,8 @@ Status ModelManager::KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType op_type, u rt_ret = rtKernelLaunchEx(devicebase, sizeof(STR_FWK_OP_KERNEL), 0, stream); if (rt_ret != RT_ERROR_NONE) { - REPORT_CALL_ERROR("E19999", "Call rtKernelLaunchEx failed, ret: 0x%X", rt_ret); - GELOGE(RT_FAILED, "rtKernelLaunchEx failed. ret: 0x%X", rt_ret); + REPORT_CALL_ERROR("E19999", "Call rtKernelLaunchEx failed, ret:0x%X", rt_ret); + GELOGE(RT_FAILED, "[Call][RtKernelLaunchEx] failed. ret:0x%X", rt_ret); GE_IF_BOOL_EXEC(aicpu_kernel_addr != nullptr, GE_CHK_RT(rtFree(aicpu_kernel_addr))); GE_CHK_RT(rtFree(devicebase)); GE_CHK_RT(rtStreamDestroy(stream)); @@ -161,9 +157,8 @@ Status ModelManager::KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType op_type, u } rt_ret = rtStreamSynchronize(stream); if (rt_ret != RT_ERROR_NONE) { - REPORT_CALL_ERROR("E19999", "Call rtStreamSynchronize failed, ret: 0x%X", - rt_ret); - GELOGE(RT_FAILED, "rtStreamSynchronize failed. ret: 0x%X", rt_ret); + REPORT_CALL_ERROR("E19999", "Call rtStreamSynchronize failed, ret:0x%X", rt_ret); + GELOGE(RT_FAILED, "[Call][RtStreamSynchronize] failed. ret:0x%X", rt_ret); GE_IF_BOOL_EXEC(aicpu_kernel_addr != nullptr, GE_CHK_RT(rtFree(aicpu_kernel_addr))); GE_CHK_RT(rtFree(devicebase)); GE_CHK_RT(rtStreamDestroy(stream)); @@ -172,8 +167,8 @@ Status ModelManager::KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType op_type, u if (aicpu_kernel_addr != nullptr) { rt_ret = rtFree(aicpu_kernel_addr); if (rt_ret != RT_ERROR_NONE) { - REPORT_CALL_ERROR("E19999", "Call rtFree failed, ret: 0x%X", rt_ret); - GELOGE(RT_FAILED, "free memory failed. ret: 0x%X", rt_ret); + REPORT_CALL_ERROR("E19999", "Call rtFree failed, ret:0x%X", rt_ret); + GELOGE(RT_FAILED, "[Free][Memory] failed. ret:0x%X", rt_ret); GE_CHK_RT(rtFree(devicebase)); GE_CHK_RT(rtStreamDestroy(stream)); return RT_ERROR_TO_GE_STATUS(rt_ret); @@ -181,15 +176,15 @@ Status ModelManager::KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType op_type, u } rt_ret = rtFree(devicebase); if (rt_ret != RT_ERROR_NONE) { - REPORT_CALL_ERROR("E19999", "Call rtFree failed, ret: 0x%X", rt_ret); - GELOGE(RT_FAILED, "free memory failed. ret: 0x%X", rt_ret); + REPORT_CALL_ERROR("E19999", "Call rtFree failed, ret:0x%X", rt_ret); + GELOGE(RT_FAILED, "[Free][Memory] failed. ret:0x%X", rt_ret); GE_CHK_RT(rtStreamDestroy(stream)); return RT_ERROR_TO_GE_STATUS(rt_ret); } rt_ret = rtStreamDestroy(stream); if (rt_ret != RT_ERROR_NONE) { - REPORT_CALL_ERROR("E19999", "Call rtStreamDestroy failed, ret: 0x%X", rt_ret); - GELOGE(RT_FAILED, "rtStreamDestroy failed. ret: 0x%X", rt_ret); + REPORT_CALL_ERROR("E19999", "Call rtStreamDestroy failed, ret:0x%X", rt_ret); + GELOGE(RT_FAILED, "[Call][RtStreamDestroy] failed. ret:0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } return SUCCESS; @@ -235,9 +230,8 @@ ge::Status ModelManager::DestroyAicpuSessionForInfer(uint32_t model_id) { auto it = model_map_.find(model_id); if (it == model_map_.end()) { - REPORT_INNER_ERROR("E19999", "Param model_id:%u can't find in model_map, check invalid", - model_id); - GELOGE(ACL_ERROR_GE_EXEC_MODEL_ID_INVALID, "model id %u does not exists.", model_id); + REPORT_INNER_ERROR("E19999", "Param model_id:%u can't find in model_map, check invalid", model_id); + GELOGE(ACL_ERROR_GE_EXEC_MODEL_ID_INVALID, "[Check][Param] model id %u does not exists.", model_id); return ACL_ERROR_GE_EXEC_MODEL_ID_INVALID; } uint64_t session_id = it->second->GetSessionId(); @@ -256,7 +250,8 @@ ge::Status ModelManager::DestroyAicpuKernel(uint64_t session_id, uint32_t model_ if (ret != SUCCESS) { REPORT_CALL_ERROR("E19999", "Call KernelLaunchEx fail, model_id:%u, sub_model_id:%u, session_id:%lu", model_id, sub_model_id, session_id); - GELOGE(FAILED, "Destroy aicpu kernel failed."); + GELOGE(FAILED, "[Call][KernelLaunchEx] fail, model_id:%u, sub_model_id:%u, session_id:%lu", + model_id, sub_model_id, session_id); return FAILED; } } @@ -304,7 +299,7 @@ ge::Status ModelManager::DoLoadHybridModelOnline(uint32_t model_id, const string hybrid_model->SetModelId(model_id); hybrid_model->SetDeviceId(GetContext().DeviceId()); hybrid_model->SetOmName(om_name); - GE_CHK_STATUS_RET(hybrid_model->Init(), "Failed to init hybrid model. model_id = %u", model_id); + GE_CHK_STATUS_RET(hybrid_model->Init(), "[Init][HybridModel] failed. model_id = %u", model_id); auto shared_model = std::shared_ptr(hybrid_model.release()); InsertModel(model_id, shared_model); return SUCCESS; @@ -315,7 +310,8 @@ bool ModelManager::IsNeedHybridLoad(ge::GeRootModel &ge_root_model) { if (root_graph == nullptr) { REPORT_INNER_ERROR("E19999", "root graph in param ge_root_model is nullptr, model_id:%u, " "check invalid", ge_root_model.GetModelId()); - GELOGE(FAILED, "no model on root model"); + GELOGE(FAILED, "[Check][Param] root graph in param ge_root_model is nullptr, model_id:%u", + ge_root_model.GetModelId()); return false; } bool is_shape_unknown = root_graph->GetGraphUnknownFlag(); @@ -354,9 +350,11 @@ Status ModelManager::UpdateSessionId(uint32_t model_id, GeModelPtr ge_model, std::shared_ptr &davinci_model, uint64_t &session_id) { uint64_t new_session_id; Status ret = GenSessionId(new_session_id); - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, return ret, "Generate session_id for infer failed."); + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, return ret, + "[Generate][SessionId] for infer failed, model_id:%u.", model_id); ret = davinci_model->UpdateSessionId(new_session_id); - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, return ret, "Update session_id for infer failed."); + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, return ret, + "[Update][SessionId] for infer failed, model_id:%u.", model_id); ge_model->InsertSessionMap(model_id, new_session_id); GELOGD("Update new session id: %lu.", new_session_id); session_id = new_session_id; @@ -382,7 +380,7 @@ bool ModelManager::HasVarNode(ComputeGraphPtr &compute_graph) const { /// Status ModelManager::LoadModelOnline(uint32_t &model_id, const shared_ptr &ge_root_model, std::shared_ptr listener) { - GE_CHK_BOOL_RET_STATUS(listener.get() != nullptr, PARAM_INVALID, "Param incorrect, listener is null"); + GE_CHK_BOOL_RET_STATUS(listener.get() != nullptr, PARAM_INVALID, "[Check][Param] Param incorrect, listener is null"); if (model_id == INVALID_MODEL_ID) { GenModelId(&model_id); GELOGD("Generate new model_id:%u", model_id); @@ -444,13 +442,13 @@ Status ModelManager::LoadModelOnline(uint32_t &model_id, const shared_ptr &davinci_model) { - GE_CHK_BOOL_EXEC(davinci_model != nullptr, return, "davinci_model ptr is null, id: %u", model_id); + GE_CHK_BOOL_EXEC(davinci_model != nullptr, return, "[Check][Param] davinci_model ptr is null, id:%u", model_id); std::lock_guard lock(map_mutex_); model_map_[model_id] = davinci_model; } void ModelManager::InsertModel(uint32_t model_id, shared_ptr &hybrid_model) { - GE_CHK_BOOL_EXEC(hybrid_model != nullptr, return, "hybrid_model ptr is null, id: %u", model_id); + GE_CHK_BOOL_EXEC(hybrid_model != nullptr, return, "[Check][Param] hybrid_model ptr is null, id:%u", model_id); std::lock_guard lock(map_mutex_); hybrid_model_map_[model_id] = hybrid_model; } @@ -502,7 +500,7 @@ std::shared_ptr ModelManager::GetHybridModel(uint32_ } Status ModelManager::Unload(uint32_t model_id) { - GE_CHK_STATUS_RET(DeleteModel(model_id), "failed to unload model id: %u", model_id); + GE_CHK_STATUS_RET(DeleteModel(model_id), "[Delete][Model] failed, model id:%u", model_id); if (device_count > 0) { device_count--; GELOGI("Unload model %u success.", model_id); @@ -515,7 +513,7 @@ Status ModelManager::Unload(uint32_t model_id) { } Status ModelManager::UnloadModeldef(uint32_t model_id) { - GE_CHK_STATUS_RET(DeleteModel(model_id), "failed to unload modeldef id: %u", model_id); + GE_CHK_STATUS_RET(DeleteModel(model_id), "[Delete][Model] failed, model id: %u", model_id); return SUCCESS; } @@ -526,8 +524,8 @@ Status ModelManager::DataInput(const InputData &input_data, OutputData &output_d Status status = data_wrap->Init(input_data, output_data); if (status != SUCCESS) { - REPORT_CALL_ERROR("E19999", "Init InputDataWrapper failed, input data index: %u", input_data.index); - GELOGE(domi::PUSH_DATA_FAILED, "Init InputDataWrapper failed, input data index: %u.", input_data.index); + REPORT_CALL_ERROR("E19999", "Init InputDataWrapper failed, input data index:%u", input_data.index); + GELOGE(domi::PUSH_DATA_FAILED, "[Init][InputDataWrapper] failed, input data index:%u.", input_data.index); return domi::PUSH_DATA_FAILED; } @@ -536,7 +534,8 @@ Status ModelManager::DataInput(const InputData &input_data, OutputData &output_d std::shared_ptr model = GetModel(model_id); - GE_CHK_BOOL_RET_STATUS(model != nullptr, PARAM_INVALID, "Invalid model id %u in InputData! ", model_id); + GE_CHK_BOOL_RET_STATUS(model != nullptr, PARAM_INVALID, + "[Get][Model] failed, Invalid model id %u in InputData!", model_id); GE_IF_BOOL_EXEC(model->GetDataInputTid() == 0, model->SetDataInputTid(mmGetTid())); @@ -544,7 +543,7 @@ Status ModelManager::DataInput(const InputData &input_data, OutputData &output_d GE_CHECK_NOTNULL(inputer); if (inputer->Push(data_wrap) != SUCCESS) { REPORT_CALL_ERROR("E19999", "DataInputer queue is full, please call again later, model_id %u", model_id); - GELOGE(domi::DATA_QUEUE_ISFULL, "Data queue is full, please call again later, model_id %u ", model_id); + GELOGE(domi::DATA_QUEUE_ISFULL, "[Call][Push] Data queue is full, please call again later, model_id %u ", model_id); return domi::DATA_QUEUE_ISFULL; } GELOGD("Data input success, model id:%u", model_id); @@ -558,10 +557,9 @@ Status ModelManager::GetCurDynamicDims(const vector> &user_real_ GELOGD("Start get cur dynamic dims."); if (user_real_input_dims.size() != user_input_dims.size()) { REPORT_INNER_ERROR("E19999", "Param user_real_input_dims.size:%zu != user_input_dims.size:%zu, " - "check invalid", - user_real_input_dims.size(), user_input_dims.size()); + "check invalid", user_real_input_dims.size(), user_input_dims.size()); GELOGE(INTERNAL_ERROR, - "The input count of user: %zu should be equal to the data count of graph: %zu", + "[Check][Param] The input count of user:%zu should be equal to the data count of graph:%zu", user_real_input_dims.size(), user_input_dims.size()); return INTERNAL_ERROR; } @@ -571,8 +569,8 @@ Status ModelManager::GetCurDynamicDims(const vector> &user_real_ REPORT_INNER_ERROR("E19999", "Param user_real_input_dims[%zu].size:%zu != user_input_dims[%zu].size:%zu, " "check invalid", i, user_real_input_dims[i].size(), i, user_input_dims[i].second.size()); - GELOGE(INTERNAL_ERROR, - "The shape size: %zu of dynamic input: %s should be equal to the shape size of input shape: %zu.", + GELOGE(INTERNAL_ERROR, "[Check][Param] The shape size:%zu of dynamic input:%s " + "should be equal to the shape size of input shape:%zu.", user_real_input_dims[i].size(), user_input_dims[i].first.c_str(), user_input_dims[i].second.size()); return INTERNAL_ERROR; } @@ -594,7 +592,7 @@ Status ModelManager::GetCurDynamicDims(const vector> &user_real_ if (!cur_dynamic_dims_valid) { REPORT_INNER_ERROR("E19999", "cur dynamic dims is %s, not exist in options, check invalid", formats::JoinToString(cur_dynamic_dims).c_str()); - GELOGE(INTERNAL_ERROR, "Cur dynamic dims is %s, not exist in options.", + GELOGE(INTERNAL_ERROR, "[Check][Param] Cur dynamic dims is %s, not exist in options.", formats::JoinToString(cur_dynamic_dims).c_str()); return INTERNAL_ERROR; } @@ -632,15 +630,16 @@ Status ModelManager::DataInputTensor(uint32_t model_id, const std::vector(cur_dynamic_dims.size() * sizeof(int32_t)); - GE_CHK_BOOL_EXEC(memcpy_s(data.data, length, cur_dynamic_dims.data(), length) == EOK, return INTERNAL_ERROR, - "Failed to memcpy data."); + GE_CHK_BOOL_EXEC(memcpy_s(data.data, length, cur_dynamic_dims.data(), length) == EOK, + REPORT_CALL_ERROR("E19999", "memcpy data failed, size:%u", length); + return INTERNAL_ERROR, "[Memcpy][Data] failed, size:%u.", length); data.length = length; input_data.blobs.push_back(data); } @@ -654,21 +653,22 @@ Status ModelManager::DataInputTensor(uint32_t model_id, const std::vectorInit(input_data, output_data), return domi::PUSH_DATA_FAILED, - "Init InputDataWrapper failed,input data model_id is : %u.", model_id); + "[Init][InputDataWrapper] failed, input data model_id:%u.", model_id); if (hybrid_model != nullptr) { - GE_CHK_STATUS_RET(hybrid_model->EnqueueData(data_wrap), "Data queue is full, please call again later, model_id %u ", - model_id); + GE_CHK_STATUS_RET(hybrid_model->EnqueueData(data_wrap), + "[Enqueue][Data] Data queue is full, please call again later, model_id:%u", model_id); return SUCCESS; } - GE_CHK_BOOL_RET_STATUS(model != nullptr, PARAM_INVALID, "Invalid model id %u in InputData! ", model_id); + GE_CHK_BOOL_RET_STATUS(model != nullptr, PARAM_INVALID, + "[Check][Param] Invalid model id %u in InputData!", model_id); DataInputer *inputer = model->GetDataInputer(); GE_CHECK_NOTNULL(inputer); GE_CHK_STATUS_EXEC(inputer->Push(data_wrap), return domi::DATA_QUEUE_ISFULL, - "Data queue is full, please call again later, model_id %u ", model_id); + "[Call][Push] Data queue is full, please call again later, model_id %u ", model_id); GELOGD("Data input success, model id:%u", model_id); @@ -691,7 +691,8 @@ Status ModelManager::Start(uint32_t model_id) { std::shared_ptr davinci_model = GetModel(model_id); - GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, PARAM_INVALID, "Invalid model id %u to start! ", model_id); + GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, PARAM_INVALID, + "[Get][Model] failed, Invalid model id %u to start! ", model_id); Status status = davinci_model->ModelRunStart(); if (status == SUCCESS) { @@ -718,7 +719,8 @@ Status ModelManager::Stop(uint32_t model_id) { } std::shared_ptr davinci_model = GetModel(model_id); - GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, PARAM_INVALID, "Invalid model id %u to stop!", model_id); + GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, PARAM_INVALID, + "[Get][Model] failed, Invalid model id %u to stop!", model_id); Status status = davinci_model->ModelRunStop(); if (status == SUCCESS) { @@ -746,9 +748,8 @@ Status ModelManager::HandleCommand(const Command &command) { auto iter = cmds.find(command.cmd_type); if (iter == cmds.end()) { - REPORT_INNER_ERROR("E19999", "Unsupported command:%s check", - command.cmd_type.c_str()); - GELOGE(PARAM_INVALID, "Unsupported command: %s", command.cmd_type.c_str()); + REPORT_INNER_ERROR("E19999", "Unsupported command:%s check", command.cmd_type.c_str()); + GELOGE(PARAM_INVALID, "[Check][Param] Unsupported command:%s", command.cmd_type.c_str()); return PARAM_INVALID; } else { return iter->second(command); @@ -761,8 +762,8 @@ Status ModelManager::GetModelByCmd(const Command &command, REPORT_INNER_ERROR("E19999", "command.cmd_params.size:%zu < kCmdParSize:%u, command_type:%s, " "check invalid", command.cmd_params.size(), kCmdParSize, command.cmd_type.c_str()); - GELOGE(PARAM_INVALID, "When the cmd_type is '%s', the size of cmd_params must larger than 2.", - command.cmd_type.c_str()); + GELOGE(PARAM_INVALID, "[Check][Param] When the cmd_type is '%s', the size of cmd_params must larger than 2.", + command.cmd_type.c_str()); return PARAM_INVALID; } @@ -773,19 +774,16 @@ Status ModelManager::GetModelByCmd(const Command &command, try { model_id = std::stoi(value); } catch (std::invalid_argument &) { - REPORT_INNER_ERROR("E19999", "%s param:%s, check invalid", PROFILE_MODEL_ID.c_str(), - value.c_str()); - GELOGE(PARAM_INVALID, "Model id: %s is invalid.", value.c_str()); + REPORT_INNER_ERROR("E19999", "%s param:%s, check invalid", PROFILE_MODEL_ID.c_str(), value.c_str()); + GELOGE(PARAM_INVALID, "[Check][Param] Model id:%s is invalid.", value.c_str()); return PARAM_INVALID; } catch (std::out_of_range &) { - REPORT_INNER_ERROR("E19999", "%s param:%s, check out of range", PROFILE_MODEL_ID.c_str(), - value.c_str()); - GELOGE(PARAM_INVALID, "Model id: %s is out of range.", value.c_str()); + REPORT_INNER_ERROR("E19999", "%s param:%s, check out of range", PROFILE_MODEL_ID.c_str(), value.c_str()); + GELOGE(PARAM_INVALID, "[Check][Param] Model id:%s is out of range.", value.c_str()); return PARAM_INVALID; } catch (...) { - REPORT_INNER_ERROR("E19999", "%s param:%s, check cannot change to int", - PROFILE_MODEL_ID.c_str(), value.c_str()); - GELOGE(FAILED, "Model id: %s cannot change to int.", value.c_str()); + REPORT_INNER_ERROR("E19999", "%s param:%s, check cannot change to int", PROFILE_MODEL_ID.c_str(), value.c_str()); + GELOGE(FAILED, "[Check][Param] Model id:%s cannot change to int.", value.c_str()); return FAILED; } @@ -793,15 +791,13 @@ Status ModelManager::GetModelByCmd(const Command &command, GE_CHECK_NOTNULL(model_manager); davinci_model = model_manager->GetModel(static_cast(model_id)); if (davinci_model == nullptr) { - REPORT_INNER_ERROR("E19999", "GetModel from model_manager fail, model_id:%u", - model_id); - GELOGE(FAILED, "Model id: %d is invaild or model is not loaded.", model_id); + REPORT_INNER_ERROR("E19999", "GetModel from model_manager fail, model_id:%u", model_id); + GELOGE(FAILED, "[Get][Model] failed, Model id:%d is invaild or model is not loaded.", model_id); return FAILED; } } else { - REPORT_INNER_ERROR("E19999", "Fisrt cmd_param not %s, check invalid", - PROFILE_MODEL_ID.c_str()); - GELOGE(FAILED, "The model_id parameter is not found in the command."); + REPORT_INNER_ERROR("E19999", "Fisrt cmd_param not %s, check invalid", PROFILE_MODEL_ID.c_str()); + GELOGE(FAILED, "[Check][Param] The model_id parameter is not found in the command."); return FAILED; } @@ -817,7 +813,8 @@ Status ModelManager::HandleProfModelSubscribeCommand(const Command &command) { if (ProfilingManager::Instance().ProfModelSubscribe(command.module_index, static_cast(davinci_model.get())) != SUCCESS) { - GELOGE(FAILED, "Handle prof model subscribe failed."); + GELOGE(FAILED, "[Handle][ProfModelSubscribe] failed, module_index:%lu.", + command.module_index); return FAILED; } @@ -832,7 +829,7 @@ Status ModelManager::HandleProfModelUnsubscribeCommand(const Command &command) { } if (ProfilingManager::Instance().ProfModelUnsubscribe(static_cast(davinci_model.get())) != SUCCESS) { - GELOGE(FAILED, "Handle prof model unsubscribe failed."); + GELOGE(FAILED, "[Handle][ProfModelUnsubscribe] failed."); return FAILED; } @@ -842,7 +839,7 @@ Status ModelManager::HandleProfModelUnsubscribeCommand(const Command &command) { Status ModelManager::HandleProfInitCommand(const Command &command) { uint64_t module_index = command.module_index; if (ProfilingManager::Instance().ProfInit(module_index) != SUCCESS) { - GELOGE(FAILED, "Handle prof init failed."); + GELOGE(FAILED, "[Handle][ProfInit] failed, module_index:%lu.", module_index); return FAILED; } return SUCCESS; @@ -850,7 +847,7 @@ Status ModelManager::HandleProfInitCommand(const Command &command) { Status ModelManager::HandleProfFinalizeCommand(const Command &command) { if (ProfilingManager::Instance().ProfFinalize() != SUCCESS) { - GELOGE(FAILED, "Handle prof finalize failed."); + GELOGE(FAILED, "[Handle][ProfFinalize] failed."); return FAILED; } return SUCCESS; @@ -866,13 +863,14 @@ Status ModelManager::HandleProfStartCommand(const Command &command) { if (command.cmd_params.size() < kProfStartCmdParaSize) { REPORT_INNER_ERROR("E19999", "command.cmd_params.size:%zu < %zu, check invalid", command.cmd_params.size(), kProfStartCmdParaSize); - GELOGE(PARAM_INVALID, "When the cmd_type is 'profile start', the size of cmd_params must larger than 2."); + GELOGE(PARAM_INVALID, "[Check][Param] When the cmd_type is 'profile start', " + "the size:%zu of cmd_params must larger than 2.", command.cmd_params.size()); return PARAM_INVALID; } if (command.cmd_params.size() > kProfCmdParaMaxSize) { REPORT_INNER_ERROR("E19999", "command.cmd_params.size:%zu > %zu, check invalid", command.cmd_params.size(), kProfCmdParaMaxSize); - GELOGE(PARAM_INVALID, "Command para size[%zu] larger than max[1000].", command.cmd_params.size()); + GELOGE(PARAM_INVALID, "[Check][Param] Command param size[%zu] larger than max[1000].", command.cmd_params.size()); return PARAM_INVALID; } @@ -886,7 +884,7 @@ Status ModelManager::HandleProfStartCommand(const Command &command) { } uint64_t module_index = command.module_index; if (ProfilingManager::Instance().ProfStartProfiling(module_index, cmd_params_map) != SUCCESS) { - GELOGE(FAILED, "Handle prof start failed."); + GELOGE(FAILED, "[Handle][ProfStartProfiling] failed, module_index:%lu.", module_index); return FAILED; } return SUCCESS; @@ -896,13 +894,14 @@ Status ModelManager::HandleProfStopCommand(const Command &command) { if (command.cmd_params.size() < kProfStartCmdParaSize) { REPORT_INNER_ERROR("E19999", "command.cmd_params.size:%zu < %zu, check invalid", command.cmd_params.size(), kProfStartCmdParaSize); - GELOGE(PARAM_INVALID, "When the cmd_type is 'profile stop', the size of cmd_params must larger than 2."); + GELOGE(PARAM_INVALID, "[Check][Param] When the cmd_type is 'profile stop', " + "the size:%zu of cmd_params must larger than 2.", command.cmd_params.size()); return PARAM_INVALID; } if (command.cmd_params.size() > kProfCmdParaMaxSize) { REPORT_INNER_ERROR("E19999", "command.cmd_params.size:%zu > %zu, check invalid", command.cmd_params.size(), kProfCmdParaMaxSize); - GELOGE(PARAM_INVALID, "Command para size[%zu] larger than max[1000].", command.cmd_params.size()); + GELOGE(PARAM_INVALID, "[Check][Param] Command param size[%zu] larger than max[1000].", command.cmd_params.size()); return PARAM_INVALID; } @@ -916,7 +915,7 @@ Status ModelManager::HandleProfStopCommand(const Command &command) { } uint64_t module_index = command.module_index; if (ProfilingManager::Instance().ProfStopProfiling(module_index, cmd_params_map) != SUCCESS) { - GELOGE(FAILED, "Handle prof finalize failed."); + GELOGE(FAILED, "[Handle][ProfStopProfiling] failed, module_index:%lu.", module_index); return FAILED; } return SUCCESS; @@ -927,9 +926,8 @@ static Status ParserPara(const Command &command, const string &dump_key, string if (iter != command.cmd_params.end()) { ++iter; if (iter == command.cmd_params.end()) { - REPORT_INNER_ERROR("E19999", "dump_key:%s can't find in command.param, check invalid", - dump_key.c_str()); - GELOGE(PARAM_INVALID, "Invalid access."); + REPORT_INNER_ERROR("E19999", "dump_key:%s can't find in command.param, check invalid", dump_key.c_str()); + GELOGE(PARAM_INVALID, "[Check][Param] dump_key:%s can't find in command.param, check invalid", dump_key.c_str()); return PARAM_INVALID; } dump_value = *iter; @@ -939,9 +937,9 @@ static Status ParserPara(const Command &command, const string &dump_key, string Status ModelManager::HandleDumpCommand(const Command &command) { if (command.cmd_params.size() % kDumpCmdPairSize != 0) { - REPORT_INNER_ERROR("E19999", "command.cmd_params.size:%zu MOD 2 != 0, check invalid", - command.cmd_params.size()); - GELOGE(PARAM_INVALID, "When the cmd_type is 'dump', the size of cmd_params must be a even number."); + REPORT_INNER_ERROR("E19999", "command.cmd_params.size:%zu MOD 2 != 0, check invalid", command.cmd_params.size()); + GELOGE(PARAM_INVALID, "[Check][Param] When the cmd_type is 'dump', " + "the size:%zu of cmd_params must be a even number.", command.cmd_params.size()); return PARAM_INVALID; } @@ -953,14 +951,14 @@ Status ModelManager::HandleDumpCommand(const Command &command) { auto ret = ParserPara(command, DUMP_STATUS, dump_status); if (ret != SUCCESS) { - GELOGE(PARAM_INVALID, "parser dump status failed"); + GELOGE(PARAM_INVALID, "[Parser][DumpStatus] failed, ret:%d", ret); return FAILED; } GELOGI("dump status = %s.", dump_status.c_str()); ret = ParserPara(command, DUMP_MODEL, dump_model); if (ret != SUCCESS) { - GELOGE(PARAM_INVALID, "parser dump model failed"); + GELOGE(PARAM_INVALID, "[Parser][DumpModel] failed, ret:%d", ret); return FAILED; } GELOGI("dump model = %s.", dump_model.c_str()); @@ -979,7 +977,7 @@ Status ModelManager::HandleDumpCommand(const Command &command) { ret = ParserPara(command, DUMP_FILE_PATH, dump_path); if (ret != SUCCESS) { - GELOGE(PARAM_INVALID, "parser dump path failed"); + GELOGE(PARAM_INVALID, "[Parser][DumpPath] failed, ret:%d", ret); return FAILED; } if (!dump_path.empty() && dump_path[dump_path.size() - 1] != '/') { @@ -990,7 +988,7 @@ Status ModelManager::HandleDumpCommand(const Command &command) { ret = ParserPara(command, DUMP_MODE, dump_mode); if (ret != SUCCESS) { - GELOGE(PARAM_INVALID, "parser dump mode failed"); + GELOGE(PARAM_INVALID, "[Parser][DumpMode] failed, ret:%d", ret); return FAILED; } GELOGI("dump mode = %s", dump_mode.c_str()); @@ -1010,8 +1008,8 @@ Status ModelManager::GetMaxUsedMemory(const uint32_t model_id, uint64_t &max_siz } std::shared_ptr davinci_model = GetModel(model_id); - GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, PARAM_INVALID, "GetMaxUsedMemory Failed, Invalid model id %u!", - model_id); + GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, PARAM_INVALID, + "[Get][Model] failed, Invalid model id:%u!", model_id); max_size = davinci_model->TotalMemSize(); return SUCCESS; @@ -1020,8 +1018,8 @@ Status ModelManager::GetMaxUsedMemory(const uint32_t model_id, uint64_t &max_siz Status ModelManager::GetInputOutputDescInfo(const uint32_t model_id, vector &input_desc, vector &output_desc) { std::shared_ptr davinci_model = GetModel(model_id); - GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, PARAM_INVALID, "GetInputOutputDescInfo Failed, Invalid model id %u!", - model_id); + GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, PARAM_INVALID, + "[Get][Model] failed, Invalid model id %u!", model_id); return davinci_model->GetInputOutputDescInfo(input_desc, output_desc); } @@ -1038,7 +1036,7 @@ Status ModelManager::GetInputOutputDescInfo(const uint32_t model_id, vector davinci_model = GetModel(model_id); GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, ACL_ERROR_GE_EXEC_MODEL_ID_INVALID, - "GetInputOutputDescInfo Failed, Invalid model id %u!", model_id); + "[Get][Model] Failed, Invalid model id %u!", model_id); return davinci_model->GetInputOutputDescInfo(input_desc, output_desc, inputFormats, outputFormats, new_model_desc); } @@ -1059,7 +1057,7 @@ Status ModelManager::GetDynamicBatchInfo(const uint32_t model_id, std::vector davinci_model = GetModel(model_id); GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, ACL_ERROR_GE_EXEC_MODEL_ID_INVALID, - "GetDynamicBatchInfo failed, Invalid model id %u!", model_id); + "[Get][Model] failed, Invalid model id %u!", model_id); return davinci_model->GetDynamicBatchInfo(batch_info, dynamic_type); } @@ -1074,7 +1072,7 @@ Status ModelManager::GetDynamicBatchInfo(const uint32_t model_id, std::vector> &batch_info) { std::shared_ptr davinci_model = GetModel(model_id); GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, ACL_ERROR_GE_EXEC_MODEL_ID_INVALID, - "GetCombinedDynamicDims Failed, Invalid Model ID %u!", model_id); + "[Get][Model] Failed, Invalid Model ID %u!", model_id); davinci_model->GetCombinedDynamicDims(batch_info); return SUCCESS; @@ -1097,7 +1095,7 @@ Status ModelManager::GetUserDesignateShapeOrder(const uint32_t model_id, auto davinci_model = GetModel(model_id); GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, ACL_ERROR_GE_EXEC_MODEL_ID_INVALID, - "GetUserDesignateShapeOrder Failed, Invalid Model ID %u!", model_id) + "[Get][Model] Failed, Invalid Model ID %u!", model_id) davinci_model->GetUserDesignateShapeOrder(user_input_shape_order); return SUCCESS; } @@ -1105,7 +1103,7 @@ Status ModelManager::GetUserDesignateShapeOrder(const uint32_t model_id, Status ModelManager::GetCurShape(const uint32_t model_id, std::vector &batch_info, int32_t &dynamic_type) { auto davinci_model = GetModel(model_id); GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, ACL_ERROR_GE_EXEC_MODEL_ID_INVALID, - "GetCurShape Failed, Invalid Model ID %u!", model_id); + "[Get][Model] Failed, Invalid Model ID %u!", model_id); davinci_model->GetCurShape(batch_info, dynamic_type); return SUCCESS; } @@ -1134,7 +1132,7 @@ Status ModelManager::GetModelAttr(uint32_t model_id, std::vector &dynami std::shared_ptr davinci_model = GetModel(model_id); GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, ACL_ERROR_GE_EXEC_MODEL_ID_INVALID, - "GetModelAttr Failed, Invalid Model ID %u!", model_id); + "[Get][Model] Failed, Invalid Model ID %u!", model_id); davinci_model->GetModelAttr(dynamic_output_shape_info); return SUCCESS; } @@ -1150,14 +1148,14 @@ Status ModelManager::GetModelAttr(uint32_t model_id, std::vector &dynami Status ModelManager::GetAippInfo(const uint32_t model_id, uint32_t index, AippConfigInfo &aipp_info) { std::shared_ptr davinci_model = GetModel(model_id); GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, ACL_ERROR_GE_EXEC_MODEL_ID_INVALID, - "GetAIPPInfo failed, invalid model_id is %u.", model_id); + "[Get][Model] failed, invalid model_id is %u.", model_id); return davinci_model->GetAippInfo(index, aipp_info); } Status ModelManager::GetAippType(uint32_t model_id, uint32_t index, InputAippType &type, size_t &aipp_index) { std::shared_ptr davinci_model = GetModel(model_id); GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, ACL_ERROR_GE_EXEC_MODEL_ID_INVALID, - "GetAIPPInfo failed, invalid model_id is %u.", model_id); + "[Get][Model] failed, invalid model_id is %u.", model_id); return davinci_model->GetAippType(index, type, aipp_index); } @@ -1173,7 +1171,7 @@ Status ModelManager::GenSessionId(uint64_t &session_id) { mmTimeval tv; if (mmGetTimeOfDay(&tv, nullptr) != 0) { REPORT_CALL_ERROR("E19999", "Call mmGetTimeOfDay fail. errmsg:%s", strerror(errno)); - GELOGE(INTERNAL_ERROR, "Failed to get current time."); + GELOGE(INTERNAL_ERROR, "[Call][MmGetTimeOfDay] fail. errmsg:%s", strerror(errno)); return INTERNAL_ERROR; } uint64_t timestamp = static_cast(tv.tv_sec * kTimeSpecMiro + tv.tv_usec); // 1000000us @@ -1192,7 +1190,8 @@ Status ModelManager::GenSessionId(uint64_t &session_id) { Status ModelManager::LoadModelOffline(uint32_t &model_id, const ModelData &model, shared_ptr listener, void *dev_ptr, size_t mem_size, void *weight_ptr, size_t weight_size) { GE_CHK_BOOL_RET_STATUS(model.key.empty() || mmAccess2(model.key.c_str(), M_F_OK) == EN_OK, - ACL_ERROR_GE_PARAM_INVALID, "Input key file path %s is invalid, %s", model.key.c_str(), strerror(errno)); + ACL_ERROR_GE_PARAM_INVALID, + "[Check][Param] Input key file path %s is invalid, %s", model.key.c_str(), strerror(errno)); GenModelId(&model_id); mmTimespec timespec = mmGetTickCount(); @@ -1200,14 +1199,14 @@ Status ModelManager::LoadModelOffline(uint32_t &model_id, const ModelData &model ModelHelper model_helper; Status ret = model_helper.LoadRootModel(model); if (ret != SUCCESS) { - GELOGE(ret, "load model failed."); + GELOGE(ret, "[Load][RootModel] failed, ret:%d, model_id:%u.", ret, model_id); return ret; } if (model_helper.GetModelType()) { bool is_shape_unknown = false; GE_CHK_STATUS_RET(model_helper.GetGeRootModel()->CheckIsUnknownShape(is_shape_unknown), - "CheckIsUnknownShape failed, model id:%u", model_id); + "[Check][IsUnknownShape] failed, model id:%u", model_id); if (is_shape_unknown || GetContext().GetHostExecFlag()) { return DoLoadHybridModelOnline(model_id, model.om_name, model_helper.GetGeRootModel(), listener); } @@ -1218,7 +1217,7 @@ Status ModelManager::LoadModelOffline(uint32_t &model_id, const ModelData &model shared_ptr davinci_model = MakeShared(model.priority, listener); if (davinci_model == nullptr) { REPORT_CALL_ERROR("E19999", "New DavinciModel fail"); - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Make shared failed"); + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[New][DavinciModel] fail"); return ACL_ERROR_GE_MEMORY_ALLOCATION; } davinci_model->SetProfileTime(MODEL_LOAD_START, (timespec.tv_sec * kTimeSpecNano + @@ -1234,7 +1233,7 @@ Status ModelManager::LoadModelOffline(uint32_t &model_id, const ModelData &model rtError_t rt_ret = rtGetDevice(&device_id); if (rt_ret != RT_ERROR_NONE || device_id < 0) { REPORT_CALL_ERROR("E19999", "Call rtGetDevice failed, ret = 0x%X", rt_ret); - GELOGE(rt_ret, "Call rtGetDevice failed, ret = 0x%X, device_id = %d.", rt_ret, device_id); + GELOGE(rt_ret, "[Call][RtGetDevice] failed, ret = 0x%X, device_id = %d.", rt_ret, device_id); return RT_ERROR_TO_GE_STATUS(rt_ret); } davinci_model->SetDeviceId(device_id); @@ -1250,12 +1249,13 @@ Status ModelManager::LoadModelOffline(uint32_t &model_id, const ModelData &model /// Update session_id for infer in load model to avoid the same session_id. uint64_t new_session_id; ret = GenSessionId(new_session_id); - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, break, "Generate session_id for inference failed."); + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, break, "[Generate][SessionId] for inference failed, ret:%d.", ret); ret = davinci_model->UpdateSessionId(new_session_id); - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, break, "Update session_id for inference failed."); + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, break, + "[Update][SessionId] for inference failed, session id:%lu.", new_session_id); ret = davinci_model->Init(dev_ptr, mem_size, weight_ptr, weight_size); - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, break, "DavinciInit failed."); + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, break, "[Init][DavinciModel] failed, ret:%d.", ret); InsertModel(model_id, davinci_model); @@ -1280,26 +1280,27 @@ Status ModelManager::LoadModelWithQ(uint32_t &model_id, const ModelData &model_d const std::vector &input_queue_ids, const std::vector &output_queue_ids) { GE_CHK_BOOL_RET_STATUS(model_data.key.empty() || mmAccess2(model_data.key.c_str(), M_F_OK) == EN_OK, - ACL_ERROR_GE_PARAM_INVALID, "input key file path %s is not valid, %s", + ACL_ERROR_GE_PARAM_INVALID, + "[Check][Param] input key file path %s is not valid, %s", model_data.key.c_str(), strerror(errno)); ModelHelper model_helper; Status ret = model_helper.LoadModel(model_data); if (ret != SUCCESS) { - GELOGE(ret, "load model failed."); + GELOGE(ret, "[Load][Model] failed."); return ret; } shared_ptr davinci_model = MakeShared(model_data.priority, nullptr); if (davinci_model == nullptr) { REPORT_CALL_ERROR("E19999", "New DavinciModel fail"); - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "create model failed."); + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Create][Model] failed."); return ACL_ERROR_GE_MEMORY_ALLOCATION; } ret = davinci_model->Assign(model_helper.GetGeModel()); if (ret != SUCCESS) { - GELOGE(ret, "assign model failed."); + GELOGE(ret, "[Assign][Model] failed, ret:%d.", ret); return ret; } @@ -1308,15 +1309,17 @@ Status ModelManager::LoadModelWithQ(uint32_t &model_id, const ModelData &model_d /// Update session_id for infer in load model to avoid the same session_id. uint64_t new_session_id; ret = GenSessionId(new_session_id); - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, return ret, "Generate session_id for infer failed."); + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, return ret, + "[Generate][SessionId] for infer failed, ret:%d.", ret); ret = davinci_model->UpdateSessionId(new_session_id); - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, return ret, "Update session_id for infer failed."); + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, return ret, + "[Update][SessionId] for infer failed, SessionId:%lu.", new_session_id); GenModelId(&model_id); davinci_model->SetId(model_id); ret = davinci_model->SetQueIds(input_queue_ids, output_queue_ids); if (ret != SUCCESS) { - GELOGE(ret, "set model queue ids failed."); + GELOGE(ret, "[Set][Ids] for model queue failed, ret:%d, model_id:%u.", ret, model_id); return ret; } @@ -1324,7 +1327,7 @@ Status ModelManager::LoadModelWithQ(uint32_t &model_id, const ModelData &model_d ret = davinci_model->Init(); if (ret != SUCCESS) { - GELOGE(ret, "init model failed."); + GELOGE(ret, "[Init][Model] failed, ret:%d, model_id:%u.", ret, model_id); return ret; } @@ -1362,7 +1365,7 @@ Status ModelManager::ExecuteModel(uint32_t model_id, rtStream_t stream, bool asy std::shared_ptr davinci_model = GetModel(model_id); GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, ACL_ERROR_GE_EXEC_MODEL_ID_INVALID, - "Invalid model id %u, check whether model has been loaded or not.", model_id); + "[Get][Model] Invalid model id %u, check whether model has been loaded or not.", model_id); if (davinci_model->NeedDestroyAicpuKernel()) { GELOGI("Start to destroy specified aicpu kernel."); @@ -1413,9 +1416,8 @@ Status ModelManager::LoadCustAicpuSo(const OpDescPtr &op_desc, const string &so_ rtContext_t rt_cur_ctx = nullptr; auto rt_error = rtCtxGetCurrent(&rt_cur_ctx); if (rt_error != RT_ERROR_NONE) { - REPORT_CALL_ERROR("E19999", "Call rtCtxGetCurrent failed, ret = 0x%X", - rt_error); - GELOGE(RT_FAILED, "get current context failed, runtime result is %d", static_cast(rt_error)); + REPORT_CALL_ERROR("E19999", "Call rtCtxGetCurrent failed, ret = 0x%X", rt_error); + GELOGE(RT_FAILED, "[Call][RtCtxGetCurrent] failed, runtime result is %d", static_cast(rt_error)); return RT_FAILED; } @@ -1450,9 +1452,8 @@ Status ModelManager::LaunchKernelCustAicpuSo(const string &kernel_name) { rtContext_t rt_cur_ctx = nullptr; auto rt_error = rtCtxGetCurrent(&rt_cur_ctx); if (rt_error != RT_ERROR_NONE) { - REPORT_CALL_ERROR("E19999", "Call rtCtxGetCurrent failed, ret = 0x%X", - rt_error); - GELOGE(RT_FAILED, "get current context failed, runtime result is %d", static_cast(rt_error)); + REPORT_CALL_ERROR("E19999", "Call rtCtxGetCurrent failed, ret = 0x%X", rt_error); + GELOGE(RT_FAILED, "[Call][RtCtxGetCurrent] failed, runtime result is %d", static_cast(rt_error)); return RT_FAILED; } uintptr_t resource_id = reinterpret_cast(rt_cur_ctx); @@ -1477,17 +1478,15 @@ Status ModelManager::LaunchKernelCustAicpuSo(const string &kernel_name) { status = rtMalloc(&d_aicpu_data, aicpu_data_length, RT_MEMORY_HBM); if (status != RT_ERROR_NONE) { - REPORT_CALL_ERROR("E19999", "Call rtMalloc failed, size:%u, ret = 0x%X", - aicpu_data_length, status); - GELOGE(RT_FAILED, "Call rt failed, status: 0x%x", status); + REPORT_CALL_ERROR("E19999", "Call rtMalloc failed, size:%u, ret = 0x%X", aicpu_data_length, status); + GELOGE(RT_FAILED, "[Call][RtMalloc] failed, size:%u, ret = 0x%X", aicpu_data_length, status); return RT_ERROR_TO_GE_STATUS(status); } allocated_mem.push_back(d_aicpu_data); status = rtMalloc(&d_so_name, so_name.size(), RT_MEMORY_HBM); if (status != RT_ERROR_NONE) { - REPORT_CALL_ERROR("E19999", "Call rtMalloc fail, size:%zu, ret = 0x%X", - so_name.size(), status); - GELOGE(RT_FAILED, "Call rt failed, status: 0x%x", status); + REPORT_CALL_ERROR("E19999", "Call rtMalloc fail, size:%zu, ret = 0x%X", so_name.size(), status); + GELOGE(RT_FAILED, "[Call][RtMalloc] fail, size:%zu, ret = 0x%X", so_name.size(), status); return RT_ERROR_TO_GE_STATUS(status); } allocated_mem.push_back(d_so_name); @@ -1509,9 +1508,8 @@ Status ModelManager::LaunchKernelCustAicpuSo(const string &kernel_name) { uint32_t args_size = sizeof(CustAicpuSoBuf) * v_cust_so.size(); status = rtMalloc(&args, args_size, RT_MEMORY_HBM); if (status != RT_ERROR_NONE) { - REPORT_CALL_ERROR("E19999", "Call rtMalloc fail, size:%u, ret = 0x%X", - args_size, status); - GELOGE(RT_FAILED, "Call rt failed, status: 0x%x", status); + REPORT_CALL_ERROR("E19999", "Call rtMalloc fail, size:%u, ret = 0x%X", args_size, status); + GELOGE(RT_FAILED, "[Call][RtMalloc] fail, size:%u, ret = 0x%X", args_size, status); return RT_ERROR_TO_GE_STATUS(status); } allocated_mem.push_back(args); @@ -1525,9 +1523,8 @@ Status ModelManager::LaunchKernelCustAicpuSo(const string &kernel_name) { uint32_t batch_args_size = sizeof(BatchLoadOpFromBufArgs); status = rtMalloc(&batch_args, batch_args_size, RT_MEMORY_HBM); if (status != RT_ERROR_NONE) { - REPORT_CALL_ERROR("E19999", "Call rtMalloc fail, size:%u, ret = 0x%X", - batch_args_size, status); - GELOGE(RT_FAILED, "Call rt failed, status: 0x%x", status); + REPORT_CALL_ERROR("E19999", "Call rtMalloc fail, size:%u, ret = 0x%X", batch_args_size, status); + GELOGE(RT_FAILED, "[Call][RtMalloc] fail, size:%u, ret = 0x%X", batch_args_size, status); return RT_ERROR_TO_GE_STATUS(status); } allocated_mem.push_back(batch_args); @@ -1539,9 +1536,8 @@ Status ModelManager::LaunchKernelCustAicpuSo(const string &kernel_name) { status = rtStreamSynchronize(stream); if (status != RT_ERROR_NONE) { - REPORT_CALL_ERROR("E19999", "Call rtStreamSynchronize fail, ret = 0x%X", - status); - GELOGE(RT_FAILED, "Call rt stream sync failed, status: 0x%x", status); + REPORT_CALL_ERROR("E19999", "Call rtStreamSynchronize fail, ret = 0x%X", status); + GELOGE(RT_FAILED, "[Call][RtStreamSynchronize] fail, ret = 0x%X", status); return RT_ERROR_TO_GE_STATUS(status); } std::function callback = [&]() { @@ -1556,12 +1552,14 @@ Status ModelManager::LaunchKernelCustAicpuSo(const string &kernel_name) { } Status ModelManager::ClearAicpuSo() { - GE_CHK_STATUS_RET(LaunchKernelCustAicpuSo(kDeleteCustOp), "delete cust op so failed."); + GE_CHK_STATUS_RET(LaunchKernelCustAicpuSo(kDeleteCustOp), + "[Call][LaunchKernelCustAicpuSo] delete cust op so failed."); return SUCCESS; } Status ModelManager::LaunchCustAicpuSo() { - GE_CHK_STATUS_RET(LaunchKernelCustAicpuSo(kBatchLoadBuf), "launch cust op so failed."); + GE_CHK_STATUS_RET(LaunchKernelCustAicpuSo(kBatchLoadBuf), + "[Call][LaunchKernelCustAicpuSo] launch cust op so failed."); return SUCCESS; } @@ -1577,21 +1575,21 @@ Status ModelManager::GetModelMemAndWeightSize(const ModelData &model, size_t &me uint8_t *model_data = nullptr; uint32_t model_len = 0; Status ret = ModelParserBase::ParseModelContent(model, model_data, model_len); - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, return ACL_ERROR_GE_PARAM_INVALID, "parse model content failed!"); + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, return ACL_ERROR_GE_PARAM_INVALID, "[Parse][ModelContent] failed!"); OmFileLoadHelper om_file_helper; ret = om_file_helper.Init(model_data, model_len); - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, return ret, "om file helperInit failed!"); + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, return ret, "[Init][OmFileHelper] failed, ret:%d", ret); auto partition_table = reinterpret_cast(model_data); if (partition_table->num == 1) { REPORT_INNER_ERROR("E19999", "partition_table num in model_data is 1, check invalid"); - GELOGE(ACL_ERROR_GE_PARAM_INVALID, "om model is error,please use executable om model"); + GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Check][Param] om model is error, please use executable om model"); return ACL_ERROR_GE_PARAM_INVALID; } ModelPartition task_partition; if (om_file_helper.GetModelPartition(ModelPartitionType::TASK_INFO, task_partition) != SUCCESS) { - GELOGE(ACL_ERROR_GE_EXEC_LOAD_TASK_PARTITION_FAILED, "get task model partition failed."); + GELOGE(ACL_ERROR_GE_EXEC_LOAD_TASK_PARTITION_FAILED, "[Get][ModelPartition] failed."); return ACL_ERROR_GE_EXEC_LOAD_TASK_PARTITION_FAILED; } @@ -1601,7 +1599,7 @@ Status ModelManager::GetModelMemAndWeightSize(const ModelData &model, size_t &me } if (task_partition.size != 0) { if (!ReadProtoFromArray(task_partition.data, static_cast(task_partition.size), model_task_def.get())) { - GELOGE(ACL_ERROR_GE_EXEC_LOAD_TASK_PARTITION_FAILED, "ReadProtoFromArray failed."); + GELOGE(ACL_ERROR_GE_EXEC_LOAD_TASK_PARTITION_FAILED, "[Read][Proto] From Array failed."); return ACL_ERROR_GE_EXEC_LOAD_TASK_PARTITION_FAILED; } } @@ -1609,7 +1607,7 @@ Status ModelManager::GetModelMemAndWeightSize(const ModelData &model, size_t &me ModelPartition partition_weight; ret = om_file_helper.GetModelPartition(ModelPartitionType::WEIGHTS_DATA, partition_weight); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, return ACL_ERROR_GE_EXEC_LOAD_WEIGHT_PARTITION_FAILED, - "Get weight partition failed. ret = %u", ret); + "[Get][ModelPartition] failed. ret = %u", ret); mem_size = model_task_def->memory_size(); weight_size = partition_weight.size; @@ -1627,8 +1625,7 @@ void ModelManager::GenModelId(uint32_t *id) { Status ModelManager::GetOrigInputInfo(uint32_t model_id, uint32_t index, OriginInputInfo &orig_input_info) { std::shared_ptr davinci_model = GetModel(model_id); GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, ACL_ERROR_GE_EXEC_MODEL_ID_INVALID, - "GetOrigInputInfo failed, invalid model_id is %u.", - model_id); + "[Get][Model] failed, invalid model_id is %u.", model_id); return davinci_model->GetOrigInputInfo(index, orig_input_info); } @@ -1638,7 +1635,7 @@ Status ModelManager::GetAllAippInputOutputDims(uint32_t model_id, uint32_t index std::vector &output_dims) { std::shared_ptr davinci_model = GetModel(model_id); GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, ACL_ERROR_GE_EXEC_MODEL_ID_INVALID, - "GetAllAippInputOutputDims failed, invalid model_id is %u.", model_id); + "[Get][Model] failed, invalid model_id is %u.", model_id); return davinci_model->GetAllAippInputOutputDims(index, input_dims, output_dims); } @@ -1653,7 +1650,7 @@ ge::Status ModelManager::SyncExecuteModel(uint32_t model_id, const vector &options dump_exception_flag_ = true; rtError_t rt_ret = rtSetTaskFailCallback(reinterpret_cast(ExceptionCallback)); if (rt_ret != RT_ERROR_NONE) { - REPORT_CALL_ERROR("E19999", "Call rtSetTaskFailCallback fail, ret = 0x%X", - rt_ret); - GELOGE(RT_FAILED, "rtSetTaskFailCallback failed"); + REPORT_CALL_ERROR("E19999", "Call rtSetTaskFailCallback fail, ret = 0x%X", rt_ret); + GELOGE(RT_FAILED, "[Call][RtSetTaskFailCallback] fail, ret = 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } } else { @@ -1743,9 +1739,8 @@ Status ModelManager::LaunchKernelCheckAicpuOp(std::vector &aicpu_op // malloc sysOpInfoList in SysOpCheckInfo status = rtMalloc(&d_req_op_list, op_nums * sizeof(SysOpInfo), RT_MEMORY_HBM); if (status != RT_ERROR_NONE) { - REPORT_CALL_ERROR("E19999", "Call rtMalloc fail, size:%zu, ret = 0x%X", - op_nums * sizeof(SysOpInfo), status); - GELOGE(RT_FAILED, "Call rt failed, status: 0x%x", status); + REPORT_CALL_ERROR("E19999", "Call rtMalloc fail, size:%zu, ret = 0x%X", op_nums * sizeof(SysOpInfo), status); + GELOGE(RT_FAILED, "[Call][RtMalloc] fail, size:%zu, ret = 0x%X", op_nums * sizeof(SysOpInfo), status); return RT_ERROR_TO_GE_STATUS(status); } allocated_mem.push_back(d_req_op_list); @@ -1753,9 +1748,8 @@ Status ModelManager::LaunchKernelCheckAicpuOp(std::vector &aicpu_op // malloc sysOpInfoList in SysOpCheckResp status = rtMalloc(&d_res_op_list, op_nums * sizeof(SysOpInfo), RT_MEMORY_HBM); if (status != RT_ERROR_NONE) { - REPORT_CALL_ERROR("E19999", "Call rtMalloc fail, size:%zu, ret = 0x%X", - op_nums * sizeof(SysOpInfo), status); - GELOGE(RT_FAILED, "Call rt failed, status: 0x%x", status); + REPORT_CALL_ERROR("E19999", "Call rtMalloc fail, size:%zu, ret = 0x%X", op_nums * sizeof(SysOpInfo), status); + GELOGE(RT_FAILED, "[Call][RtMalloc] fail, size:%zu, ret = 0x%X", op_nums * sizeof(SysOpInfo), status); return RT_ERROR_TO_GE_STATUS(status); } allocated_mem.push_back(d_res_op_list); @@ -1763,9 +1757,8 @@ Status ModelManager::LaunchKernelCheckAicpuOp(std::vector &aicpu_op // malloc returnCodeList in SysOpCheckResp status = rtMalloc(&d_ret_code_list, op_nums * sizeof(ReturnCode), RT_MEMORY_HBM); if (status != RT_ERROR_NONE) { - REPORT_CALL_ERROR("E19999", "Call rtMalloc fail, size:%zu, ret = 0x%X", - op_nums * sizeof(ReturnCode), status); - GELOGE(RT_FAILED, "Call rt failed, status: 0x%x", status); + REPORT_CALL_ERROR("E19999", "Call rtMalloc fail, size:%zu, ret = 0x%X", op_nums * sizeof(ReturnCode), status); + GELOGE(RT_FAILED, "[Call][RtMalloc] fail, size:%zu, ret = 0x%X", op_nums * sizeof(ReturnCode), status); return RT_ERROR_TO_GE_STATUS(status); } allocated_mem.push_back(d_ret_code_list); @@ -1776,9 +1769,8 @@ Status ModelManager::LaunchKernelCheckAicpuOp(std::vector &aicpu_op void *d_op_type_name = nullptr; status = rtMalloc(&d_op_type_name, op_type.length(), RT_MEMORY_HBM); if (status != RT_ERROR_NONE) { - REPORT_CALL_ERROR("E19999", "Call rtMalloc fail, size:%lu, ret = 0x%X", - op_type.length(), status); - GELOGE(RT_FAILED, "Call rt failed, status: 0x%x", status); + REPORT_CALL_ERROR("E19999", "Call rtMalloc fail, size:%lu, ret = 0x%X", op_type.length(), status); + GELOGE(RT_FAILED, "[Call][RtMalloc] fail, size:%lu, ret = 0x%X", op_type.length(), status); return RT_ERROR_TO_GE_STATUS(status); } allocated_mem.push_back(d_op_type_name); @@ -1795,9 +1787,8 @@ Status ModelManager::LaunchKernelCheckAicpuOp(std::vector &aicpu_op void *d_op_type_name = nullptr; status = rtMalloc(&d_op_type_name, op_type.size(), RT_MEMORY_HBM); if (status != RT_ERROR_NONE) { - REPORT_CALL_ERROR("E19999", "Call rtMalloc fail, size:%lu, ret = 0x%X", - op_type.length(), status); - GELOGE(RT_FAILED, "Call rt failed, status: 0x%x", status); + REPORT_CALL_ERROR("E19999", "Call rtMalloc fail, size:%lu, ret = 0x%X", op_type.length(), status); + GELOGE(RT_FAILED, "[Call][RtMalloc] fail, size:%lu, ret = 0x%X", op_type.size(), status); return RT_ERROR_TO_GE_STATUS(status); } allocated_mem.push_back(d_op_type_name); @@ -1825,9 +1816,8 @@ Status ModelManager::LaunchKernelCheckAicpuOp(std::vector &aicpu_op uint32_t args_size = sizeof(SysOpCheckInfo) + sizeof(SysOpCheckResp); status = rtMalloc(&args, args_size, RT_MEMORY_HBM); if (status != RT_ERROR_NONE) { - REPORT_CALL_ERROR("E19999", "Call rtMalloc fail, size:%u, ret = 0x%X", - args_size, status); - GELOGE(RT_FAILED, "Call rt failed, status: 0x%x", status); + REPORT_CALL_ERROR("E19999", "Call rtMalloc fail, size:%u, ret = 0x%X", args_size, status); + GELOGE(RT_FAILED, "[Call][RtMalloc] fail, size:%u, ret = 0x%X", args_size, status); return RT_ERROR_TO_GE_STATUS(status); } allocated_mem.push_back(args); @@ -1842,9 +1832,8 @@ Status ModelManager::LaunchKernelCheckAicpuOp(std::vector &aicpu_op status = rtStreamSynchronize(stream); if (status != RT_ERROR_NONE) { - REPORT_CALL_ERROR("E19999", "Call rtStreamSynchronize fail, ret = 0x%X", - status); - GELOGE(RT_FAILED, "Call rt stream sync failed, status: 0x%x", status); + REPORT_CALL_ERROR("E19999", "Call rtStreamSynchronize fail, ret = 0x%X", status); + GELOGE(RT_FAILED, "[Call][RtStreamSynchronize] failed, ret:0x%X", status); GE_CHK_RT(rtStreamDestroy(stream)); return RT_ERROR_TO_GE_STATUS(status); } @@ -1879,7 +1868,8 @@ Status ModelManager::LaunchKernelCheckAicpuOp(std::vector &aicpu_op REPORT_INNER_ERROR("E19999", "res_ret_code_list.size:%zu res_aicpu_op_info_list.size:%zu res_op_nums:%lu " "not equal, check invalid", res_ret_code_list.size(), res_aicpu_op_info_list.size(), res_op_nums); - GELOGE(FAILED, "Number of retcode is not equal to number of op type."); + GELOGE(FAILED, "[Check][Param] Number:%zu of retcode is not equal to number:%zu of op type or not equal %lu.", + res_ret_code_list.size(), res_aicpu_op_info_list.size(), res_op_nums); GE_CHK_RT(rtStreamDestroy(stream)); return FAILED; } @@ -1902,9 +1892,8 @@ Status ModelManager::LaunchKernelCheckAicpuOp(std::vector &aicpu_op "<0: op_type, 1: format, 2: datatype> \n"; } fail_reason += "not support."; - REPORT_INNER_ERROR("E19999", "Check aicpu op_type failed, details:%s", - fail_reason.c_str()); - GELOGE(FAILED, "Check aicpu op_type failed. details: %s", fail_reason.c_str()); + REPORT_INNER_ERROR("E19999", "Check aicpu op_type failed, details:%s", fail_reason.c_str()); + GELOGE(FAILED, "[Check][Param] Check aicpu op_type failed. details:%s", fail_reason.c_str()); GE_CHK_RT(rtStreamDestroy(stream)); return FAILED; } @@ -1924,7 +1913,7 @@ Status ModelManager::CheckAicpuOpList(GeModelPtr ge_model) { return SUCCESS; } GE_CHK_STATUS_RET(LaunchKernelCheckAicpuOp(aicpu_optype_list, aicpu_tf_optype_list), - "Launch check aicpu op type failed."); + "[Call][LaunchKernelCheckAicpuOp] failed."); return SUCCESS; } diff --git a/ge/graph/load/model_manager/model_manager.h b/ge/graph/load/model_manager/model_manager.h index c0f14934..efba54ec 100755 --- a/ge/graph/load/model_manager/model_manager.h +++ b/ge/graph/load/model_manager/model_manager.h @@ -310,7 +310,7 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager { std::lock_guard lock(exeception_infos_mutex_); auto instance = ModelManager::GetInstance(); if (instance == nullptr) { - GELOGE(FAILED, "Instance is nullptr"); + GELOGE(FAILED, "[Get][Instance] failed, as ret is nullptr"); return; } instance->AddExceptionInfo(*rt_exception_info); diff --git a/ge/graph/load/model_manager/model_utils.cc b/ge/graph/load/model_manager/model_utils.cc index f6ff591a..f593f67b 100755 --- a/ge/graph/load/model_manager/model_utils.cc +++ b/ge/graph/load/model_manager/model_utils.cc @@ -26,10 +26,10 @@ #define VALIDATE_MEM_RANGE(OP, SIZE, OFFSET) \ do { \ if (SIZE <= static_cast(OFFSET)) { \ - REPORT_INNER_ERROR("E19999", \ - "Node:%s(%s) offset:%ld out of range size:%lu, check invalid", \ + REPORT_INNER_ERROR("E19999", "Node:%s(%s) offset:%ld out of range size:%lu, check invalid", \ OP->GetName().c_str(), OP->GetType().c_str(), OFFSET, SIZE); \ - GELOGE(OUT_OF_MEMORY, "Node: %s, memory out of range[%lu: %ld]", OP->GetName().c_str(), SIZE, OFFSET); \ + GELOGE(OUT_OF_MEMORY, "[Check][Param]Node: %s, memory out of range[%lu: %ld]", \ + OP->GetName().c_str(), SIZE, OFFSET); \ return {}; \ } \ } while (0) @@ -312,8 +312,9 @@ vector ModelUtils::GetInputDataAddrs(const RuntimeParam &model_param, Co REPORT_INNER_ERROR("E19999", "Attr:%s, memory_type.size:%zu != input_desc.size:%zu, op:%s(%s), check invalid", ATTR_NAME_INPUT_MEM_TYPE_LIST.c_str(), v_memory_type.size(), inputs_size, op_desc->GetName().c_str(), op_desc->GetType().c_str()); - GELOGE(PARAM_INVALID, "Fusion: check input size failed, op: %s, input v_memory_type size: %zu input numbers: %zu", - op_desc->GetName().c_str(), v_memory_type.size(), inputs_size); + GELOGE(PARAM_INVALID, "[Check][Param] Attr:%s, memory_type.size:%zu != input_desc.size:%zu, op:%s(%s)", + ATTR_NAME_INPUT_MEM_TYPE_LIST.c_str(), v_memory_type.size(), inputs_size, + op_desc->GetName().c_str(), op_desc->GetType().c_str()); return v_input_data_addr; } for (size_t i = 0; i < op_desc->GetAllInputsSize(); ++i) { @@ -392,8 +393,7 @@ Status ModelUtils::GetVarAddr(const RuntimeParam &model_param, const ConstOpDesc case RT_MEMORY_RDMA_HBM: if (offset < 0) { REPORT_INNER_ERROR("E19999", "Param offset:%ld < 0, check invalid", offset); - GELOGE(PARAM_INVALID, "rdma var addr is invalid, addr=%p", - reinterpret_cast(static_cast(offset))); + GELOGE(PARAM_INVALID, "[Check][Param] Param offset:%ld cannot be negative", offset); return PARAM_INVALID; } var_addr = reinterpret_cast(static_cast(offset)); @@ -403,9 +403,9 @@ Status ModelUtils::GetVarAddr(const RuntimeParam &model_param, const ConstOpDesc var_addr = model_param.var_base + offset - model_param.logic_var_base; break; default: - REPORT_INNER_ERROR("E19999", "Get mem_type:%d for offset:%ld is unsupported, check invalid", - mem_type, offset); - GELOGE(PARAM_INVALID, "unsupported memory type %u", mem_type); + REPORT_INNER_ERROR("E19999", "Get mem_type:%d for offset:%ld is unsupported, check invalid", mem_type, offset); + GELOGE(PARAM_INVALID, "[Check][Param] Get mem_type:%d for offset:%ld is unsupported, check invalid", + mem_type, offset); return PARAM_INVALID; } GE_CHECK_NOTNULL(var_addr); @@ -433,9 +433,9 @@ vector ModelUtils::GetOutputDataAddrs(const RuntimeParam &model_param, C REPORT_INNER_ERROR("E19999", "Attr:%s, memory_type.size:%zu != output_desc.size:%zu, op:%s(%s), check invalid", ATTR_NAME_OUTPUT_MEM_TYPE_LIST.c_str(), v_memory_type.size(), outputs_size, op_desc->GetName().c_str(), op_desc->GetType().c_str()); - GELOGE(PARAM_INVALID, - "Fusion: check output size failed, op: %s, output v_memory_type size: %lu output numbers: %zu", - op_desc->GetName().c_str(), v_memory_type.size(), outputs_size); + GELOGE(PARAM_INVALID, "[Check][Param] Attr:%s, memory_type.size:%zu != output_desc.size:%zu, op:%s(%s)", + ATTR_NAME_OUTPUT_MEM_TYPE_LIST.c_str(), v_memory_type.size(), outputs_size, + op_desc->GetName().c_str(), op_desc->GetType().c_str()); return v_output_data_addr; } for (size_t i = 0; i < outputs_size; ++i) { @@ -594,7 +594,7 @@ Status ModelUtils::GetRtAddress(const RuntimeParam ¶m, uintptr_t logic_addr, } else if (logic_addr != 0) { mem_addr = nullptr; REPORT_INNER_ERROR("E19999", "Check param logic addr:0x%lx abnormal", logic_addr); - GELOGE(PARAM_INVALID, "The logic addr:0x%lx is abnormal", logic_addr); + GELOGE(PARAM_INVALID, "[Check][Param] The logic addr:0x%lx is abnormal", logic_addr); return PARAM_INVALID; } diff --git a/ge/graph/load/model_manager/tbe_handle_store.cc b/ge/graph/load/model_manager/tbe_handle_store.cc index 6efb6190..36207aa2 100755 --- a/ge/graph/load/model_manager/tbe_handle_store.cc +++ b/ge/graph/load/model_manager/tbe_handle_store.cc @@ -24,7 +24,7 @@ namespace ge { void TbeHandleInfo::used_inc(uint32_t num) { if (used_ > std::numeric_limits::max() - num) { REPORT_INNER_ERROR("E19999", "Used:%u reach numeric max", used_); - GELOGE(INTERNAL_ERROR, "Used[%u] reach numeric max.", used_); + GELOGE(INTERNAL_ERROR, "[Check][Param] Used[%u] reach numeric max.", used_); return; } @@ -34,7 +34,7 @@ void TbeHandleInfo::used_inc(uint32_t num) { void TbeHandleInfo::used_dec(uint32_t num) { if (used_ < std::numeric_limits::min() + num) { REPORT_INNER_ERROR("E19999", "Used:%u reach numeric min", used_); - GELOGE(INTERNAL_ERROR, "Used[%u] reach numeric min.", used_); + GELOGE(INTERNAL_ERROR, "[Check][Param] Used[%u] reach numeric min.", used_); return; } @@ -107,9 +107,8 @@ void TBEHandleStore::ReferTBEHandle(const std::string &name) { std::lock_guard lock(mutex_); auto it = kernels_.find(name); if (it == kernels_.end()) { - REPORT_INNER_ERROR("E19999", "Kernel:%s not found in stored check invalid", - name.c_str()); - GELOGE(INTERNAL_ERROR, "Kernel[%s] not found in stored.", name.c_str()); + REPORT_INNER_ERROR("E19999", "Kernel:%s not found in stored check invalid", name.c_str()); + GELOGE(INTERNAL_ERROR, "[Check][Param] Kernel[%s] not found in stored.", name.c_str()); return; } @@ -128,9 +127,8 @@ void TBEHandleStore::EraseTBEHandle(const std::map &names for (auto &item : names) { auto it = kernels_.find(item.first); if (it == kernels_.end()) { - REPORT_INNER_ERROR("E19999", "Kernel:%s not found in stored check invalid", - item.first.c_str()); - GELOGE(INTERNAL_ERROR, "Kernel[%s] not found in stored.", item.first.c_str()); + REPORT_INNER_ERROR("E19999", "Kernel:%s not found in stored check invalid", item.first.c_str()); + GELOGE(INTERNAL_ERROR, "[Check][Param] Kernel[%s] not found in stored.", item.first.c_str()); continue; } @@ -142,7 +140,8 @@ void TBEHandleStore::EraseTBEHandle(const std::map &names if (rt_ret != RT_ERROR_NONE) { REPORT_INNER_ERROR("E19999", "Call rtDevBinaryUnRegister failed for Kernel:%s fail, ret:0x%X", item.first.c_str(), rt_ret); - GELOGE(INTERNAL_ERROR, "Kernel[%s] UnRegister handle fail:%u.", item.first.c_str(), rt_ret); + GELOGE(INTERNAL_ERROR, "[Call][RtDevBinaryUnRegister] Kernel[%s] UnRegister handle fail:%u.", + item.first.c_str(), rt_ret); } kernels_.erase(it); } diff --git a/ge/graph/load/model_manager/ts_mem_mall.h b/ge/graph/load/model_manager/ts_mem_mall.h index 74ce5a16..986b3101 100644 --- a/ge/graph/load/model_manager/ts_mem_mall.h +++ b/ge/graph/load/model_manager/ts_mem_mall.h @@ -43,7 +43,7 @@ class TsMemMall { for (auto it : mem_store_size_) { rtError_t ret = rtFree(it.second); if (ret != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "Call rtFree failed, ret: 0x%X", ret); + GELOGE(RT_FAILED, "[Call][RtFree] failed, ret:0x%X", ret); } } mem_store_size_.clear(); @@ -52,7 +52,7 @@ class TsMemMall { void *Acquire(int64_t offset, uint64_t size) { if (size == 0) { - GELOGE(RT_FAILED, "Acquire mem block failed, size: %lu", size); + GELOGE(RT_FAILED, "[Check][Param] Acquire mem block failed, size:%lu", size); return nullptr; } @@ -71,7 +71,7 @@ class TsMemMall { void *addr = nullptr; rtError_t rt_ret = rtMalloc(&addr, bytes, mem_type_); if (rt_ret != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "Call rtMalloc failed, ret: 0x%X", rt_ret); + GELOGE(RT_FAILED, "[Call][RtMalloc] failed, size:%lu, ret:0x%X", bytes, rt_ret); return nullptr; } @@ -94,7 +94,7 @@ class TsMemMall { mem_store_addr_.erase(it); rtError_t ret = rtFree(addr); if (ret != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "Call rtFree failed, ret: 0x%X", ret); + GELOGE(RT_FAILED, "[Call][RtFree] failed, ret:0x%X", ret); } } diff --git a/ge/graph/load/model_manager/zero_copy_offset.cc b/ge/graph/load/model_manager/zero_copy_offset.cc index 9d6f4e4f..4a57a899 100644 --- a/ge/graph/load/model_manager/zero_copy_offset.cc +++ b/ge/graph/load/model_manager/zero_copy_offset.cc @@ -38,8 +38,13 @@ Status ZeroCopyOffset::InitInputDataInfo(int64_t output_size, void *virtual_addr op_name_ = op_desc->GetName(); (void)ge::AttrUtils::GetListInt(op_desc, ATTR_ZERO_COPY_BASIC_OFFSET, zero_copy_basic_offset_); (void)ge::AttrUtils::GetListInt(op_desc, ATTR_ZERO_COPY_RELATIVE_OFFSET, zero_copy_relative_offset_); - GE_CHK_BOOL_EXEC(zero_copy_basic_offset_.size() == zero_copy_relative_offset_.size(), return PARAM_INVALID, - "basic_offset_size should be equal to relative_offset_size"); + GE_CHK_BOOL_EXEC(zero_copy_basic_offset_.size() == zero_copy_relative_offset_.size(), + REPORT_INNER_ERROR("E19999", "basic_offset_size:%zu not equal to relative_offset_size:%zu, " + "check invalid", zero_copy_basic_offset_.size(), + zero_copy_relative_offset_.size()); + return PARAM_INVALID, + "[Check][Param] basic_offset_size:%zu should be equal to relative_offset_size:%zu", + zero_copy_basic_offset_.size(), zero_copy_relative_offset_.size()); GELOGD("[ZCPY] zero_copy_basic_offset size is %zu", zero_copy_basic_offset_.size()); int64_t virtual_addr_offset = op_desc->GetOutputOffset().at(kDataIndex); @@ -78,7 +83,8 @@ Status ZeroCopyOffset::InitOutputDataInfo(const vector &input_size_list if (TensorUtils::GetTensorSizeInBytes(*tensor_desc, size) != GRAPH_SUCCESS) { REPORT_INNER_ERROR("E19999", "Get input TensorSize in op:%s(%s) failed, input_index:%zu", op_desc->GetName().c_str(), op_desc->GetType().c_str(), idx); - GELOGE(FAILED, "GetTensorSizeInBytes failed!"); + GELOGE(FAILED, "[Get][InputTensorSize] in op:%s(%s) failed, input_index:%zu", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), idx); return FAILED; } @@ -88,8 +94,13 @@ Status ZeroCopyOffset::InitOutputDataInfo(const vector &input_size_list op_name_ = op_desc->GetName(); (void)ge::AttrUtils::GetListInt(op_desc, ATTR_ZERO_COPY_BASIC_OFFSET, zero_copy_basic_offset_); (void)ge::AttrUtils::GetListInt(op_desc, ATTR_ZERO_COPY_RELATIVE_OFFSET, zero_copy_relative_offset_); - GE_CHK_BOOL_EXEC(zero_copy_basic_offset_.size() == zero_copy_relative_offset_.size(), return PARAM_INVALID, - "basic_offset_size should be equal to relative_offset_size"); + GE_CHK_BOOL_EXEC(zero_copy_basic_offset_.size() == zero_copy_relative_offset_.size(), + REPORT_INNER_ERROR("E19999", "basic_offset_size:%zu not equal to relative_offset_size:%zu, " + "check invalid", + zero_copy_basic_offset_.size(), zero_copy_relative_offset_.size()); + return PARAM_INVALID, + "[Check][Param] basic_offset_size:%zu should be equal to relative_offset_size:%zu", + zero_copy_basic_offset_.size(), zero_copy_relative_offset_.size()); int64_t virtual_addr_offset = op_desc->GetInputOffset().at(idx); IsL2Fusion(zero_copy_basic_offset_, virtual_addr_offset, fusion_flag); @@ -194,7 +205,8 @@ void ZeroCopyOffset::SetOutsideAddrsValue(ZeroCopyTask &zero_copy_task, void *ou for (uint32_t out_count = 0; out_count < GetAddrCount(); ++out_count) { auto args_addrs = outside_addrs_[out_count].find(outside_addr); if (args_addrs != outside_addrs_[out_count].end()) { - GE_CHK_STATUS(zero_copy_task.SetTaskArgsOffset(addr_val, offset), "Input args invalid."); + GE_CHK_STATUS(zero_copy_task.SetTaskArgsOffset(addr_val, offset), + "[Set][TaskArgsOffset] failed, Input args invalid, offset:%zu.", offset); void *args_val = static_cast(args) + offset; args_addrs->second.push_back(args_val); GELOGD("[ZCPY] set copy input: virtual_addr: 0x%lx, task_addr: %p, args: %p, offset: %zu.", addr_val, args_val, diff --git a/ge/graph/load/model_manager/zero_copy_task.cc b/ge/graph/load/model_manager/zero_copy_task.cc index c96dd8b7..4957f8ea 100755 --- a/ge/graph/load/model_manager/zero_copy_task.cc +++ b/ge/graph/load/model_manager/zero_copy_task.cc @@ -36,9 +36,9 @@ ZeroCopyTask::~ZeroCopyTask() { args_addr_ = nullptr; } */ Status ZeroCopyTask::SetTaskArgsOffset(uintptr_t addr, size_t offset) { if (offset + sizeof(uintptr_t) > args_size_) { - REPORT_INNER_ERROR("E19999", "Param offset:%zu + 8 > args_size_:%zu, check invalid", - offset, args_size_); - GELOGE(FAILED, "[ZCPY] %s set task args failed, args size: %zu, offset: %zu", name_.c_str(), args_size_, offset); + REPORT_INNER_ERROR("E19999", "Param offset:%zu + 8 > args_size_:%zu, check invalid", offset, args_size_); + GELOGE(FAILED, "[Check][Param] [ZCPY] %s set task args failed, args size:%zu, offset:%zu", + name_.c_str(), args_size_, offset); return FAILED; // unexpected error, need fix. } @@ -118,9 +118,8 @@ Status ZeroCopyTask::DistributeParam(bool async_mode, rtStream_t stream) { } if (rt_err != RT_ERROR_NONE) { - REPORT_CALL_ERROR("E19999", "Call rtMemcpyAsync or rtMemcpy failed, size:%zu, ret: 0x%X", - args_size_, rt_err); - GELOGE(RT_FAILED, "[ZCPY] %s distribute task param failed, error=0x%x", name_.c_str(), rt_err); + REPORT_CALL_ERROR("E19999", "Call rtMemcpyAsync or rtMemcpy failed, size:%zu, ret:0x%X", args_size_, rt_err); + GELOGE(RT_FAILED, "[Distribute][TaskParam] for %s failed, error = 0x%x", name_.c_str(), rt_err); return RT_ERROR_TO_GE_STATUS(rt_err); } diff --git a/ge/graph/manager/graph_caching_allocator.cc b/ge/graph/manager/graph_caching_allocator.cc index 75aa5c01..cdb1d131 100644 --- a/ge/graph/manager/graph_caching_allocator.cc +++ b/ge/graph/manager/graph_caching_allocator.cc @@ -112,7 +112,7 @@ Status CachingAllocator::Initialize(uint32_t device_id) { auto bin_ptr = new (std::nothrow) BlockBin(BlockComparator); if (bin_ptr == nullptr) { REPORT_CALL_ERROR("E19999", "New BlockBin fail, device_id:%u", device_id); - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Alloc BlockBin failed."); + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Alloc][BlockBin] failed, device_id:%u", device_id); return ACL_ERROR_GE_MEMORY_ALLOCATION; } free_block_bins_[i] = bin_ptr; @@ -147,9 +147,8 @@ uint8_t *CachingAllocator::Malloc(size_t size, uint8_t *org_ptr, uint32_t device ptr = block->ptr; } if (ptr == nullptr) { - REPORT_INNER_ERROR("E19999", "FindFreeBlock fail, size:%zu, device_id:%u", - size, device_id); - GELOGE(FAILED, "Malloc failed device id = %u, size= %zu", device_id, size); + REPORT_INNER_ERROR("E19999", "FindFreeBlock fail, size:%zu, device_id:%u", size, device_id); + GELOGE(FAILED, "[Check][Param] FindFreeBlock failed device id = %u, size= %zu", device_id, size); } return ptr; } @@ -157,18 +156,16 @@ uint8_t *CachingAllocator::Malloc(size_t size, uint8_t *org_ptr, uint32_t device Status CachingAllocator::Free(uint8_t *ptr, uint32_t device_id) { GELOGI("Free device id = %u", device_id); if (ptr == nullptr) { - REPORT_INNER_ERROR("E19999", "Param ptr is nullptr, device_id:%u, check invalid", - device_id); - GELOGE(PARAM_INVALID, "Invalid memory pointer"); + REPORT_INNER_ERROR("E19999", "Param ptr is nullptr, device_id:%u, check invalid", device_id); + GELOGE(PARAM_INVALID, "[Check][Param] Invalid memory pointer, device_id:%u", device_id); return ge::PARAM_INVALID; } std::lock_guard lock(mutex_); auto it = allocated_blocks_.find(ptr); if (it == allocated_blocks_.end()) { - REPORT_INNER_ERROR("E19999", "Param ptr not allocated before, device_id:%u, check invalid", - device_id); - GELOGE(PARAM_INVALID, "Invalid memory pointer: %p", ptr); + REPORT_INNER_ERROR("E19999", "Param ptr not allocated before, device_id:%u, check invalid", device_id); + GELOGE(PARAM_INVALID, "[Check][Param] Param ptr not allocated before, device_id:%u", device_id); return ge::PARAM_INVALID; } Block *block = it->second; @@ -225,9 +222,8 @@ Block *CachingAllocator::FindFreeBlock(size_t size, uint8_t *org_ptr, uint32_t d Block key(device_id, size, org_ptr); BlockBin *bin = GetBlockBin(size); if (bin == nullptr) { - REPORT_INNER_ERROR("E19999", "GetBlockBin fail, size:%zu, device_id:%u", - size, device_id); - GELOGE(ge::FAILED, "Get block bin failed size = %zu", size); + REPORT_INNER_ERROR("E19999", "GetBlockBin fail, size:%zu, device_id:%u", size, device_id); + GELOGE(ge::FAILED, "[Get][BlockBin] failed, size:%zu, device_id:%u", size, device_id); return nullptr; } std::lock_guard lock(mutex_); @@ -258,9 +254,8 @@ Block *CachingAllocator::SplitBlock(Block *block, size_t size, BlockBin &bin, ui Block *remaining = block; Block *new_block = new (std::nothrow) Block(device_id, size, &bin, block->ptr); if (new_block == nullptr) { - REPORT_CALL_ERROR("E19999", "New Block fail, size:%zu, device_id:%u", - size, device_id); - GELOGE(ge::FAILED, "Alloc block failed size = %zu", size); + REPORT_CALL_ERROR("E19999", "New Block fail, size:%zu, device_id:%u", size, device_id); + GELOGE(ge::FAILED, "[Alloc][Block] failed, size:%zu, device_id:%u", size, device_id); return block; } new_block->prev = remaining->prev; @@ -285,7 +280,7 @@ Status CachingAllocator::TryExtendCache(size_t size, uint32_t device_id) { size_t free_cached_memory_size = FreeCachedBlocks(); memory_addr = memory_allocator_->MallocMemory(purpose, memory_size, device_id); if (memory_addr == nullptr) { - GELOGE(ge::FAILED, "TryExtendCache failed, no enough memory for size = %zu, device_id = %u", memory_size, + GELOGE(ge::FAILED, "[Malloc][Memory] failed, no enough memory for size = %zu, device_id = %u", memory_size, device_id); return ge::FAILED; } @@ -304,16 +299,14 @@ Status CachingAllocator::TryExtendCache(size_t size, uint32_t device_id) { Status CachingAllocator::AddToBlockBin(uint8_t *ptr, size_t size, uint32_t device_id) { BlockBin *bin = GetBlockBin(size); if (bin == nullptr) { - REPORT_INNER_ERROR("E19999", "GetBlockBin fail, size:%zu, device_id:%u", - size, device_id); - GELOGE(ge::FAILED, "Get block bin failed size = %zu", size); + REPORT_INNER_ERROR("E19999", "GetBlockBin fail, size:%zu, device_id:%u", size, device_id); + GELOGE(ge::FAILED, "[Get][BlockBin] failed, size:%zu, device_id:%u", size, device_id); return ge::FAILED; } Block *block = new (std::nothrow) Block(device_id, size, bin, nullptr); if (block == nullptr) { - REPORT_CALL_ERROR("E19999", "New Block fail, size:%zu, device_id:%u", - size, device_id); - GELOGE(ge::FAILED, "Alloc block failed size = %zu", size); + REPORT_CALL_ERROR("E19999", "New Block fail, size:%zu, device_id:%u", size, device_id); + GELOGE(ge::FAILED, "[Alloc][Block] failed, size:%zu, device_id:%u", size, device_id); return ge::FAILED; } diff --git a/ge/graph/manager/graph_context.cc b/ge/graph/manager/graph_context.cc index 3a705ad9..6d202cef 100644 --- a/ge/graph/manager/graph_context.cc +++ b/ge/graph/manager/graph_context.cc @@ -33,7 +33,7 @@ GraphContext::GraphContext(const GraphNodePtr &graph_node) { if (compute_graph_ == nullptr) { std::shared_ptr graph = graph_node->GetGraph(); if (graph == nullptr) { - GELOGE(GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL, "compute_graph by graphNode is NULL!"); + GELOGE(GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL, "[Get][Graph] failed, compute_graph by graphNode is NULL!"); return; } @@ -45,7 +45,7 @@ GraphContext::GraphContext(const GraphNodePtr &graph_node) { Status GraphContext::SetComputeGraph(const GraphNodePtr &graph_node) { if (graph_node == nullptr) { REPORT_INNER_ERROR("E19999", "Param graph_node is nullptr, check invalid"); - GELOGE(GE_GRAPH_PARAM_NULLPTR, "graphNode is NULL!"); + GELOGE(GE_GRAPH_PARAM_NULLPTR, "[Check][Param] graphNode is NULL!"); return GE_GRAPH_PARAM_NULLPTR; } @@ -56,7 +56,7 @@ Status GraphContext::SetComputeGraph(const GraphNodePtr &graph_node) { std::shared_ptr graph = graph_node->GetGraph(); if (graph == nullptr) { REPORT_INNER_ERROR("E19999", "Param graph in graph_node is nullptr, check invalid"); - GELOGE(GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL, "compute_graph by graphNode is NULL!"); + GELOGE(GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL, "[Get][Graph] failed, compute_graph by graphNode is NULL!"); return GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL; } @@ -73,14 +73,15 @@ Status GraphContext::Finalize() const { return SUCCESS; } Status GraphContext::GetVariableTensor(const std::string &var_data_name, GeTensor &returned_tensor) { if (var_data_name.empty()) { REPORT_INNER_ERROR("E19999", "Param var_data_name is empty, check invalid"); - GELOGE(GE_GRAPH_EMPTY_STRING_NAME, "Variable data name is empty!"); + GELOGE(GE_GRAPH_EMPTY_STRING_NAME, "[Check][Param] Variable data name is empty!"); return GE_GRAPH_EMPTY_STRING_NAME; } if (GetVarNodeTensorTable().empty()) { REPORT_INNER_ERROR("E19999", "VarNodeTensorTable is empty, var_data_name:%s, check invalid", var_data_name.c_str()); - GELOGE(GE_GRAPH_EMPTY_VARIABLE_TENSOR_TABLE, "VarNodeTensorTable is empty!"); + GELOGE(GE_GRAPH_EMPTY_VARIABLE_TENSOR_TABLE, "[Check][Param] VarNodeTensorTable is empty, var_data_name:%s", + var_data_name.c_str()); return GE_GRAPH_EMPTY_VARIABLE_TENSOR_TABLE; } for (auto &var_record : GetVarNodeTensorTable()) { @@ -88,9 +89,8 @@ Status GraphContext::GetVariableTensor(const std::string &var_data_name, GeTenso returned_tensor.SetTensorDesc(var_record.second.GetTensorDesc()); auto ret = returned_tensor.SetData(var_record.second.GetData()); if (ret != SUCCESS) { - REPORT_INNER_ERROR("E19999", "SetData to tensor fail, var_data_name:%s", - var_data_name.c_str()); - GELOGE(ret, "Set Tensor data failed!"); + REPORT_INNER_ERROR("E19999", "SetData to tensor fail, var_data_name:%s", var_data_name.c_str()); + GELOGE(ret, "[Set][Data] to Tensor failed, var_data_name:%s", var_data_name.c_str()); return ret; } @@ -100,7 +100,8 @@ Status GraphContext::GetVariableTensor(const std::string &var_data_name, GeTenso REPORT_INNER_ERROR("E19999", "VarRecord with data_name:%s does not exist, check invalid", var_data_name.c_str()); - GELOGE(GE_GRAPH_VARIABLE_DOES_NOT_EXIST, "VarRecord with data_name %s does NOT exist!", var_data_name.c_str()); + GELOGE(GE_GRAPH_VARIABLE_DOES_NOT_EXIST, "[Check][Param] VarRecord with data_name %s does NOT exist!", + var_data_name.c_str()); return GE_GRAPH_VARIABLE_DOES_NOT_EXIST; } diff --git a/ge/graph/manager/graph_manager.cc b/ge/graph/manager/graph_manager.cc index 69c84f6f..dc1142d0 100755 --- a/ge/graph/manager/graph_manager.cc +++ b/ge/graph/manager/graph_manager.cc @@ -149,7 +149,8 @@ ge::Status CheckFpCeilingMode() { if (ret == ge::GRAPH_SUCCESS) { if (kValidFpCeilingMode.count(mode) == 0) { REPORT_INNER_ERROR("E19999", "Option ge.fpCeilingMode is invalid, value:%s", mode.c_str()); - GELOGE(ge::GE_GRAPH_OPTIONS_INVALID, "The fp_ceiling_mode %s is invalid, options are 0, 1, and 2.", mode.c_str()); + GELOGE(ge::GE_GRAPH_OPTIONS_INVALID, "[Get][Option] The fp_ceiling_mode %s is invalid, options are 0, 1, and 2.", + mode.c_str()); return ge::GE_GRAPH_OPTIONS_INVALID; } GELOGI("The parameter fp_ceiling_mode is set to %s.", mode.c_str()); @@ -178,33 +179,33 @@ Status GraphManager::Initialize(const std::map &options) { graph_run_listener_ = MakeShared(sync_run_mutex_, condition_); if (graph_run_listener_ == nullptr) { REPORT_CALL_ERROR("E19999", "New GraphModelListener fail"); - GELOGE(MEMALLOC_FAILED, "Make shared failed"); + GELOGE(MEMALLOC_FAILED, "[New][GraphModelListener] failed"); return MEMALLOC_FAILED; } // graph context graph_context_ = MakeShared(); if (graph_context_ == nullptr) { - REPORT_CALL_ERROR("E19999", "New GraphModelListener fail"); - GELOGE(MEMALLOC_FAILED, "Make shared failed."); + REPORT_CALL_ERROR("E19999", "New GraphContext fail"); + GELOGE(MEMALLOC_FAILED, "[New][GraphContext] failed."); return MEMALLOC_FAILED; } // parse option parameters Status ret = ParseOptions(options); if (ret != SUCCESS) { - GELOGE(ret, "[Initialize] parse options failed."); + GELOGE(ret, "[Parse][Options] failed."); return ret; } ret = CheckFpCeilingMode(); if (ret != SUCCESS) { - GELOGE(ret, "[Initialize] Check fp-ceiling-mode options failed."); + GELOGE(ret, "[Check][FpCeilingMode] failed."); return ret; } ret = graph_context_->Initialize(options); if (ret != SUCCESS) { - GELOGE(ret, "[Initialize] GraphContext initialize failed."); + GELOGE(ret, "[Initialize][GraphContext] failed."); return ret; } @@ -302,7 +303,7 @@ Status GraphManager::Finalize() { if (graph_context_ != nullptr) { Status ret_final = graph_context_->Finalize(); if (ret_final != SUCCESS) { - GELOGE(ret_final, "[GraphManager] graph context Finalize failed!"); + GELOGE(ret_final, "[Finalize][GraphContext] failed!"); unload_model_ret = ret_final; } } @@ -321,9 +322,8 @@ Status GraphManager::InitDynamicParams(ComputeGraphPtr &compute_graph) { std::string op_type; auto ret = GetOriginalType(node, op_type); if (ret != SUCCESS) { - REPORT_CALL_ERROR("E19999", "GetOriginalType from op:%s fail", - node->GetName().c_str()); - GELOGE(FAILED, "Failed to get node %s original type.", node->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "GetOriginalType from op:%s fail", node->GetName().c_str()); + GELOGE(FAILED, "[Get][OriginalType] from op:%s failed.", node->GetName().c_str()); return FAILED; } if ((op_desc->GetType() == DATA) || (op_type == kGetNextName)) { @@ -335,7 +335,7 @@ Status GraphManager::InitDynamicParams(ComputeGraphPtr &compute_graph) { if (!options_.input_shape.empty() && !options_.dynamic_dims.empty()) { if (!ge::ParseInputShape(options_.input_shape, GetLocalOmgContext().input_dims, GetLocalOmgContext().user_input_dims, true)) { - GELOGE(GRAPH_PARAM_INVALID, "Failed to parse input shape: %s.", options_.input_shape.c_str()); + GELOGE(GRAPH_PARAM_INVALID, "[Parse][InputShape] %s failed.", options_.input_shape.c_str()); return GRAPH_PARAM_INVALID; } GetLocalOmgContext().dynamic_dims = options_.dynamic_dims; @@ -380,7 +380,7 @@ void GraphManager::RemoveAddGraphCondition(GraphId graph_id) { Status GraphManager::CheckRepeatAdd(uint32_t graph_id, bool &is_added) { uint32_t count = 0; if (GetGraphCount(graph_id, count) != SUCCESS) { - GELOGE(INTERNAL_ERROR, "Get graph [id:%u] count failed, graph might have not been added.", graph_id); + GELOGE(INTERNAL_ERROR, "[Get][GraphCount] failed, graph[id:%u] might have not been added.", graph_id); return INTERNAL_ERROR; } // previous thread owns same graph_id has been in the middle of the AddGraph procession @@ -393,7 +393,7 @@ Status GraphManager::CheckRepeatAdd(uint32_t graph_id, bool &is_added) { GraphNodePtr graph_node; Status ret = GetGraphNode(graph_id, graph_node); if (ret != SUCCESS) { - GELOGE(ret, "[AddGraph] GetGraphNode failed, graph_id = %u.", graph_id); + GELOGE(ret, "[Get][GraphNode] failed, graph_id = %u.", graph_id); return ret; } is_added = true; @@ -418,7 +418,7 @@ void GraphManager::SetSessionGraphId(ComputeGraphPtr compute_graph, uint32_t gra Status GraphManager::NotifyWaittingGraph(uint32_t graph_id) { uint32_t count = 0; if (GetGraphCount(graph_id, count) != SUCCESS) { - GELOGE(INTERNAL_ERROR, "Get graph [id:%u] count failed, graph might have not been added.", graph_id); + GELOGE(INTERNAL_ERROR, "[Get][GraphCount] failed, graph[id:%u] might have not been added.", graph_id); return INTERNAL_ERROR; } GELOGD("Add graph finished, graph_id:%u", graph_id); @@ -433,15 +433,13 @@ Status GraphManager::CreateGraphNode(uint32_t graph_id, const Graph &graph, const std::map &options) { GraphNodePtr graph_node = MakeShared(graph_id); GE_IF_BOOL_EXEC(graph_node == nullptr, - REPORT_CALL_ERROR("E19999", "New GraphNode fail, graph_id:%u", - graph_id); - GELOGE(FAILED, "GraphNode make shared failed"); + REPORT_CALL_ERROR("E19999", "New GraphNode fail, graph_id:%u", graph_id); + GELOGE(FAILED, "[New][GraphNode] fail, graph_id:%u", graph_id); return FAILED); std::shared_ptr graph_ptr = MakeShared(graph); GE_IF_BOOL_EXEC(graph_ptr == nullptr, - REPORT_CALL_ERROR("E19999", "New Graph fail, graph_id:%u", - graph_id); - GELOGE(FAILED, "GraphPtr make shared failed"); + REPORT_CALL_ERROR("E19999", "New Graph fail, graph_id:%u", graph_id); + GELOGE(FAILED, "[New][Graph] fail, graph_id:%u", graph_id); return FAILED); // update option about tuning graph ParseOption(options, BUILD_MODE, options_.build_mode); @@ -459,7 +457,7 @@ Status GraphManager::SetStagesOptions(uint32_t graph_id, const GraphManagerOptio stages.preparer.SetOptions(options_); Status status = stages.optimizer.SetOptions(options_); if (status != SUCCESS) { - GELOGE(status, "Graph optimizer set options failed."); + GELOGE(status, "[Set][Options] for Graph optimizer failed, graph id:%u.", graph_id); return status; } stages.builder.SetOptions(options_); @@ -517,7 +515,8 @@ Status GraphManager::AddGraph(const GraphId &graph_id, const Graph &graph, if (GetAddGraphCondition(graph_id) == kDoneAdded) { GraphNodePtr graph_node; if (GetGraphNode(graph_id, graph_node) != SUCCESS) { - GELOGE(GE_GRAPH_GRAPH_NOT_EXIST, "Graph not exist while done adding previously, graph_id = %u.", graph_id); + GELOGE(GE_GRAPH_GRAPH_NOT_EXIST, "[Get][GraphNode] failed, Graph not exist while done adding previously, " + "graph_id = %u.", graph_id); return GE_GRAPH_GRAPH_NOT_EXIST; } graph_node->IncreaseLoadCount(); @@ -528,7 +527,7 @@ Status GraphManager::AddGraph(const GraphId &graph_id, const Graph &graph, // done adding graph of the former graph, avoiding repeatively adding same graph. bool is_added = false; if (CheckRepeatAdd(graph_id, is_added) != SUCCESS) { - GELOGE(INTERNAL_ERROR, "CheckRepeatAdd for graph[id:%u] failed.", graph_id); + GELOGE(INTERNAL_ERROR, "[Check][RepeatAdd] for graph[id:%u] failed.", graph_id); return INTERNAL_ERROR; } // The former graph (from different thread) owns same graph id has been successfully added. @@ -538,7 +537,7 @@ Status GraphManager::AddGraph(const GraphId &graph_id, const Graph &graph, // Do add graph SetAddGraphCondition(graph_id, kStartAdd); if (CheckGraphAdded(graph_id, graph) != SUCCESS) { - GELOGE(FAILED, "AddGraph failed."); + GELOGE(FAILED, "[Check][GraphAdded] failed, graph id:%u.", graph_id); return FAILED; } GE_CHK_STATUS_RET(ModifyDataIndex(graph, options)); @@ -548,7 +547,7 @@ Status GraphManager::AddGraph(const GraphId &graph_id, const Graph &graph, SetSessionGraphId(compute_graph, graph_id); if (CreateGraphNode(graph_id, graph, options) != SUCCESS) { - GELOGE(FAILED, "Failed to create graph_node."); + GELOGE(FAILED, "[Create][GraphNode] failed, graph id:%u.", graph_id); return FAILED; } @@ -557,12 +556,12 @@ Status GraphManager::AddGraph(const GraphId &graph_id, const Graph &graph, GetLocalOmgContext().output_type = options_.output_datatype; } if (InitDynamicParams(compute_graph) != SUCCESS) { - GELOGE(GRAPH_PARAM_INVALID, "Failed to init params when online infer is dynamic."); + GELOGE(GRAPH_PARAM_INVALID, "[Init][Params] failed, when online infer is dynamic, graph id:%u.", graph_id); return GRAPH_PARAM_INVALID; } if (SetStagesOptions(graph_id, options_) != SUCCESS) { - GELOGE(INTERNAL_ERROR, "Set stage options failed."); + GELOGE(INTERNAL_ERROR, "[Set][StagesOptions] failed, graph id:%u.", graph_id); return INTERNAL_ERROR; } @@ -570,7 +569,7 @@ Status GraphManager::AddGraph(const GraphId &graph_id, const Graph &graph, SetAddGraphCondition(graph_id, kDoneAdded); // There are threads waitting for adding same graph if (NotifyWaittingGraph(graph_id) != SUCCESS) { - GELOGE(INTERNAL_ERROR, "NotifyWaittingGraph failed."); + GELOGE(INTERNAL_ERROR, "[Notify][WaittingGraph] failed, graph id:%u.", graph_id); return INTERNAL_ERROR; } return SUCCESS; @@ -585,14 +584,13 @@ Status GraphManager::CheckGraphAdded(const GraphId &graph_id, const Graph &graph && graph_has_been_added) { REPORT_INNER_ERROR("E19999", "Get Attr:%s from graph:%u fail.", ATTR_NAME_GRAPH_HAS_BEEN_ADDED.c_str(), graph_id); - GELOGE(GE_GRAPH_GRAPH_ALREADY_EXIST, - "[GraphManager] same graph object can not be added again, graph_id = %u.", graph_id); + GELOGE(GE_GRAPH_GRAPH_ALREADY_EXIST, "[Get][Attr] %s from graph:%u fail.", + ATTR_NAME_GRAPH_HAS_BEEN_ADDED.c_str(), graph_id); return GE_GRAPH_GRAPH_ALREADY_EXIST; } } else { - REPORT_INNER_ERROR("E19999", "compute_graph from graph:%u is nullptr, check invalid", - graph_id); - GELOGE(FAILED, "compute graph is null"); + REPORT_INNER_ERROR("E19999", "compute_graph from graph:%u is nullptr, check invalid", graph_id); + GELOGE(FAILED, "[Get][ComputeGraph] failed, compute graph from graph:%u is nullptr", graph_id); return FAILED; } return SUCCESS; @@ -602,11 +600,11 @@ Status GraphManager::AddGraphWithCopy(const GraphId &graph_id, const Graph &grap const std::map &options, const OmgContext &omg_context) { if (HasGraphNode(graph_id)) { - GELOGE(GE_GRAPH_GRAPH_ALREADY_EXIST, "[GraphManager] graph exists, graph_id = %u", graph_id); + GELOGE(GE_GRAPH_GRAPH_ALREADY_EXIST, "[Has][GraphNode] graph exists, graph_id = %u", graph_id); return GE_GRAPH_GRAPH_ALREADY_EXIST; } if (CheckGraphAdded(graph_id, graph) != SUCCESS) { - GELOGE(FAILED, "AddGraphWithCopy failed."); + GELOGE(FAILED, "[Check][GraphAdded] failed, graph_id = %u", graph_id); return FAILED; } IncreaseGraphCount(graph_id); @@ -620,7 +618,7 @@ Status GraphManager::AddGraphWithCopy(const GraphId &graph_id, const Graph &grap SetSessionGraphId(new_compute_graph, graph_id); std::shared_ptr new_graph_ptr = GraphUtils::CreateGraphPtrFromComputeGraph(new_compute_graph); if (CreateGraphNode(graph_id, *new_graph_ptr, options) != SUCCESS) { - GELOGE(FAILED, "Failed to create graph_node."); + GELOGE(FAILED, "[Create][GraphNode] failed, graph_id = %u", graph_id); return FAILED; } @@ -629,12 +627,12 @@ Status GraphManager::AddGraphWithCopy(const GraphId &graph_id, const Graph &grap GetLocalOmgContext().output_type = options_.output_datatype; } if (InitDynamicParams(new_compute_graph) != SUCCESS) { - GELOGE(GRAPH_PARAM_INVALID, "Failed to init params when online infer is dynamic."); + GELOGE(GRAPH_PARAM_INVALID, "[Init][Params] failed, when online infer is dynamic, graph_id = %u", graph_id); return GRAPH_PARAM_INVALID; } if (SetStagesOptions(graph_id, options_) != SUCCESS) { - GELOGE(INTERNAL_ERROR, "Set stage options failed."); + GELOGE(INTERNAL_ERROR, "[Set][StagesOptions] failed, graph_id = %u", graph_id); return INTERNAL_ERROR; } @@ -655,9 +653,9 @@ Status GraphManager::MergeSubGraph(ComputeGraphPtr &compute_graph, const ge::Com Status ret_topo = compute_graph->TopologicalSorting(); if (ret_topo != SUCCESS) { - REPORT_CALL_ERROR("E19999", "TopologicalSorting fail, graph_id:%u", - compute_graph->GetGraphID()); - GELOGE(ret_topo, "[GraphManager]: TopologicalSorting the merged graph failed."); + REPORT_CALL_ERROR("E19999", "TopologicalSorting fail, graph_id:%u", compute_graph->GetGraphID()); + GELOGE(ret_topo, "[Call][TopologicalSorting] for the merged graph failed, graph_id:%u", + compute_graph->GetGraphID()); return ret_topo; } } else { @@ -692,16 +690,16 @@ Status GraphManager::CopySubGraphAndMarkFusion(const ComputeGraphPtr &compute_gr std::vector output_nodes; ComputeGraphPtr new_compute_graph = GraphUtils::CloneGraph(old_compute_graph, "", input_nodes, output_nodes); if (new_compute_graph == nullptr) { - REPORT_CALL_ERROR("E19999", "CloneGraph fail, graph_id:%u", - compute_graph->GetGraphID()); - GELOGE(INTERNAL_ERROR, "Clone graph failed."); + REPORT_CALL_ERROR("E19999", "CloneGraph fail, graph_id:%u", compute_graph->GetGraphID()); + GELOGE(INTERNAL_ERROR, "[Clone][Graph] failed, graph_id:%u", compute_graph->GetGraphID()); return INTERNAL_ERROR; } copy_graphs.emplace(old_compute_graph->GetName(), new_compute_graph); if (!AttrUtils::SetBool(old_compute_graph, ATTR_NAME_NEED_LX_FUSION, true)) { REPORT_INNER_ERROR("E19999", "Set Attr:%s to graph:%u fail", ATTR_NAME_NEED_LX_FUSION.c_str(), old_compute_graph->GetGraphID()); - GELOGE(INTERNAL_ERROR, "Set attr lx_fusion to graph failed."); + GELOGE(INTERNAL_ERROR, "[Set][Attr] %s to graph:%u failed.", + ATTR_NAME_NEED_LX_FUSION.c_str(), old_compute_graph->GetGraphID()); return INTERNAL_ERROR; } } @@ -739,7 +737,7 @@ Status GraphManager::OptimizeSubGraphWithMultiThreads(ComputeGraphPtr compute_gr ErrorManager::GetInstance().GetErrorManagerContext(), GetThreadLocalContext()); if (!f.valid()) { - GELOGE(FAILED, "Future is invalid"); + GELOGE(FAILED, "[Call][Commit] failed, Future is invalid, session_id:%lu", session_id); return FAILED; } vector_future.emplace_back(std::move(f)); @@ -756,7 +754,7 @@ Status GraphManager::OptimizeSubGraphWithMultiThreads(ComputeGraphPtr compute_gr ErrorManager::GetInstance().GetErrorManagerContext(), GetThreadLocalContext()); if (!f.valid()) { - GELOGE(FAILED, "Future is invalid"); + GELOGE(FAILED, "[Call][Commit] failed, Future is invalid, session_id:%lu", session_id); return FAILED; } vector_future.emplace_back(std::move(f)); @@ -767,7 +765,7 @@ Status GraphManager::OptimizeSubGraphWithMultiThreads(ComputeGraphPtr compute_gr Status ret_status = vector_future[i].get(); if (ret_status != SUCCESS) { REPORT_CALL_ERROR("E19999", "subgraph %zu optimize failed", i); - GELOGE(ret_status, "subgraph %zu optimize failed", i); + GELOGE(ret_status, "[Check][Param] subgraph %zu optimize failed", i); return ret_status; } } @@ -778,7 +776,7 @@ bool GraphManager::CheckAllFusionOptimizeSuccess(const ComputeGraphPtr &compute_ Graph2SubGraphInfoList &sub_graph_map) { if (compute_graph == nullptr) { REPORT_INNER_ERROR("E19999", "Param compute_graph is nullptr, check invalid"); - GELOGE(PARAM_INVALID, "Input param compute_graph is nullptr."); + GELOGE(PARAM_INVALID, "[Check][Param] Input param compute_graph is nullptr."); return false; } @@ -819,7 +817,8 @@ Status GraphManager::ReplaceSubgraphWithOriGraph(const ComputeGraphPtr &compute_ if (iter == copy_graphs.end()) { REPORT_INNER_ERROR("E19999", "Can not find subgraph:%s in copy graphs, check invalid", subgraph->GetSubGraph()->GetName().c_str()); - GELOGE(FAILED, "Can not find subgraph:%s in copy graphs.", subgraph->GetSubGraph()->GetName().c_str()); + GELOGE(FAILED, "[Check][Param] Can not find subgraph:%s in copy graphs.", + subgraph->GetSubGraph()->GetName().c_str()); return FAILED; } subgraph->SetSubGraph(iter->second); @@ -832,7 +831,8 @@ Status GraphManager::ReplaceSubgraphWithOriGraph(const ComputeGraphPtr &compute_ if (iter == copy_graphs.end()) { REPORT_INNER_ERROR("E19999", "Can not find subgraph:%s in copy graphs, check invalid", subgraph->GetSubGraph()->GetName().c_str()); - GELOGE(FAILED, "Can not find subgraph:%s in copy graphs.", subgraph->GetSubGraph()->GetName().c_str()); + GELOGE(FAILED, "[Check][Param] Can not find subgraph:%s in copy graphs.", + subgraph->GetSubGraph()->GetName().c_str()); return FAILED; } subgraph->SetSubGraph(iter->second); @@ -850,7 +850,7 @@ Status GraphManager::SetSubgraph(uint64_t session_id, ComputeGraphPtr compute_gr options_.build_step.c_str()); Status ret = OptimizeSubGraphWithMultiThreads(compute_graph, sub_graph_map, session_id); if (ret != SUCCESS) { - GELOGE(ret, "Multiply optimize subgraph failed"); + GELOGE(ret, "[Call][OptimizeSubGraphWithMultiThreads] failed, ret:%d, session_id:%lu", ret, session_id); return ret; } return SUCCESS; @@ -892,7 +892,8 @@ Status GraphManager::PreRunOptimizeOriginalGraph(const GraphNodePtr &graph_node, GE_CHK_STATUS_RET(graph_pass.AddPass("PreRun::CtrlEdgeTransferPass", new (std::nothrow) CtrlEdgeTransferPass)) GE_CHK_STATUS_RET(graph_pass.Run(compute_graph)); - GE_CHK_STATUS_RET(stages.optimizer.IdentifyReference(compute_graph), "Identify reference failed."); + GE_CHK_STATUS_RET(stages.optimizer.IdentifyReference(compute_graph), + "[Identify][Reference] failed, graph:%s.", compute_graph->GetName().c_str()); GELOGD("PreRun:PreRunOptimizeOriginalGraph success."); return SUCCESS; } @@ -930,9 +931,8 @@ Status GraphManager::PreRunAfterOptimizeSubGraph(const GraphNodePtr &graph_node, Status ret = compute_graph->TopologicalSorting(); if (ret != SUCCESS) { - REPORT_CALL_ERROR("E19999", "TopologicalSorting fail, graph_id:%u", - compute_graph->GetGraphID()); - GELOGE(ret, "Graph topological sort failed, ret:%d.", ret); + REPORT_CALL_ERROR("E19999", "TopologicalSorting fail, graph_id:%u", compute_graph->GetGraphID()); + GELOGE(ret, "[Call][TopologicalSorting] fail, graph_id:%u", compute_graph->GetGraphID()); return ret; } @@ -949,14 +949,14 @@ Status GraphManager::SetRtContext(rtContext_t rt_context, rtCtxMode_t mode, uint if (rt_ret != RT_ERROR_NONE) { REPORT_CALL_ERROR("E19999", "Call rtCtxCreate faileded, session_id:%lu, graph_id:%u, mode:%d", session_id, graph_id, mode); - GELOGE(FAILED, "Call rt api failed, ret: 0x%X", rt_ret); + GELOGE(FAILED, "[Call][RtCtxCreate] faileded, session_id:%lu, graph_id:%u, mode:%d", session_id, graph_id, mode); return FAILED; } rt_ret = rtCtxSetCurrent(rt_context); if (rt_ret != RT_ERROR_NONE) { REPORT_CALL_ERROR("E19999", "Call rtCtxSetCurrent failed, session_id:%lu, graph_id:%u, mode:%d", session_id, graph_id, mode); - GELOGE(FAILED, "Call rt api failed, ret: 0x%X", rt_ret); + GELOGE(FAILED, "[Call][RtCtxSetCurrent] failed, session_id:%lu, graph_id:%u, mode:%d", session_id, graph_id, mode); return FAILED; } RtContextUtil::GetInstance().AddRtContext(session_id, graph_id, rt_context); @@ -970,7 +970,7 @@ Status GraphManager::RunCustomPass(const GraphNodePtr &graph_node) { GE_TIMESTAMP_START(RunCustomPass); GraphPtr graph = std::const_pointer_cast(const_graph); - GE_CHK_STATUS_RET(CustomPassHelper::Instance().Run(graph), "Graph[%s] run custom pass fail.", + GE_CHK_STATUS_RET(CustomPassHelper::Instance().Run(graph), "[Call][Run] for Graph[%s] fail.", comp_graph->GetName().c_str()); GE_TIMESTAMP_END(RunCustomPass, "GraphBuilder::RunCustomPass"); return SUCCESS; @@ -986,7 +986,7 @@ Status GraphManager::PreRun(const GraphNodePtr &graph_node, const std::vectorSetSessionID(session_id); auto analyzer_instance = Analyzer::GetInstance(); GE_CHK_STATUS_RET(analyzer_instance->BuildJsonObject(session_id, compute_graph->GetGraphID()), - "BuildJsonObject Failed") + "[Build][JsonObject] Failed, session_id:%lu", session_id) GEEVENT("PreRun start: graph node size %zu, session id %lu, graph id %u, graph name %s.", compute_graph->GetDirectNodesSize(), session_id, compute_graph->GetGraphID(), @@ -995,7 +995,7 @@ Status GraphManager::PreRun(const GraphNodePtr &graph_node, const std::vectorGetGraphID()); if (ret != SUCCESS) { - GELOGE(ret, "Set rt context failed."); + GELOGE(ret, "[Set][RtContext] failed, session_id:%lu, graph_id:%u.", session_id, compute_graph->GetGraphID()); return ret; } @@ -1009,17 +1009,20 @@ Status GraphManager::PreRun(const GraphNodePtr &graph_node, const std::vectorGetName().c_str()); + GELOGE(ret, "[Run][PreRunOptimizeOriginalGraph] failed for graph:%s, session_id:%lu", + compute_graph->GetName().c_str(), session_id); return ret; } } ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kSubGraphOptimize); // set fuzz compile flag after origin graph optimize - GE_CHK_STATUS_RET(SetFuzzCompileFlag(compute_graph), "Set fuzz compile flag failed."); + GE_CHK_STATUS_RET(SetFuzzCompileFlag(compute_graph), + "[Set][FuzzCompileFlag] failed for graph:%s.", compute_graph->GetName().c_str()); ret = PreRunOptimizeSubGraph(graph_node, compute_graph, session_id); if (ret != SUCCESS) { - GELOGE(ret, "Run PreRunOptimizeSubGraph failed for graph:%s.", compute_graph->GetName().c_str()); + GELOGE(ret, "[Run][PreRunOptimizeSubGraph] failed for graph:%s, session_id:%lu.", + compute_graph->GetName().c_str(), session_id); return ret; } @@ -1033,7 +1036,8 @@ Status GraphManager::PreRun(const GraphNodePtr &graph_node, const std::vectorGetName().c_str()); + GELOGE(ret, "[Run][PreRunAfterOptimizeSubGraph] failed for graph:%s, session_id:%lu.", + compute_graph->GetName().c_str(), session_id); return ret; } } @@ -1058,7 +1062,7 @@ Status GraphManager::SetFuzzCompileFlag(ComputeGraphPtr &compute_graph) { GE_CHECK_NOTNULL(op_desc); GELOGD("Fuzz compile flag is %d.", GetLocalOmgContext().fuzz_compile_flag); if (!AttrUtils::SetBool(op_desc, ATTR_NAME_FUZZ_BUILD, GetLocalOmgContext().fuzz_compile_flag)) { - GELOGE(FAILED, "[Set][ATTR_NAME_FUZZ_BUILD]Failed to set fuzz build attr to %s.", op_desc->GetName().c_str()); + GELOGE(FAILED, "[Set][ATTR] %s to %s failed.", ATTR_NAME_FUZZ_BUILD.c_str(), op_desc->GetName().c_str()); return FAILED; } } @@ -1074,7 +1078,7 @@ Status GraphManager::SubexpressionMigration(ComputeGraphPtr &compute_graph) { auto ret = pass_manager.Run(compute_graph); GE_TIMESTAMP_END(SubexpressionMigrationPass, "GraphManager::SubexpressionMigration"); if (ret != SUCCESS && ret != NOT_CHANGED) { - GELOGE(ret, "Run SubexpressionMigrationPass failed, ret:%u.", ret); + GELOGE(ret, "[Run][SubexpressionMigrationPass] failed, ret:%u.", ret); return ret; } @@ -1091,7 +1095,7 @@ Status GraphManager::StartForRunGraph(const GraphNodePtr &graph_node, const std: REPORT_INNER_ERROR("E19999", "Graph:%u has not build before, can't run directly, " "check invalid", graph_node->GetGraphId()); GELOGE(PARAM_INVALID, - "The graph %u need to re-build, you should remove it from GE " + "[Get][BuildFlag] The graph %u need to re-build, you should remove it from GE " "first, then AddGraph again and rebuild it.", graph_node->GetGraphId()); return PARAM_INVALID; @@ -1104,7 +1108,7 @@ Status GraphManager::StartForRunGraph(const GraphNodePtr &graph_node, const std: // release rts generate context RtContextUtil::GetInstance().DestroyRtContexts(session_id, graph_node->GetGraphId()); if (ret != SUCCESS) { - GELOGE(ret, "PreRun Failed, graph_id:%u.", graph_node->GetGraphId()); + GELOGE(ret, "[Call][PreRun] Failed, graph_id:%u, session_id:%lu.", graph_node->GetGraphId(), session_id); return ret; } } @@ -1115,7 +1119,7 @@ Status GraphManager::StartForRunGraph(const GraphNodePtr &graph_node, const std: ret = LoadGraphAsync(ge_root_model, graph_node); } if (ret != SUCCESS) { - GELOGE(ret, "LoadGraph Failed."); + GELOGE(ret, "[Load][Graph] Failed, graph_id:%u.", graph_node->GetGraphId()); return ret; } graph_node->SetBuildFlag(true); @@ -1129,7 +1133,7 @@ Status GraphManager::StartForRunGraph(const GraphNodePtr &graph_node, const std: ret = LoadGraphAsync(ge_root_model_ptr, graph_node); } if (ret != SUCCESS) { - GELOGE(ret, "LoadGraph Failed."); + GELOGE(ret, "[Load][Graph] Failed, graph_id:%u.", graph_node->GetGraphId()); return ret; } } @@ -1160,7 +1164,7 @@ Status GraphManager::LoadGraph(const GeRootModelPtr &ge_root_model, const GraphN Status ret = GraphLoader::LoadModelOnline(model_id_info.model_id, ge_root_model, model_listener); GE_TIMESTAMP_EVENT_END(LoadGraph, "GraphManager::LoadGraph"); if (ret != SUCCESS) { - GELOGE(ret, "[StartForRunGraph] LoadGraph Failed"); + GELOGE(ret, "[Load][Model] failed, ret:%d", ret); graph_node->SetRunFlag(false); return ret; } @@ -1260,7 +1264,7 @@ Status GraphManager::InnerRunGraph(GraphNodePtr &graph_node, const GraphId &grap const std::vector &inputs, std::vector &outputs) { Status ret = graph_executor_.SetCondition(&sync_run_mutex_, &condition_, graph_run_listener_); if (ret != SUCCESS) { - GELOGE(GE_GRAPH_RUNGRAPH_FAILED, "[RunGraph] set condition failed, graph_id = %u.", graph_id); + GELOGE(GE_GRAPH_RUNGRAPH_FAILED, "[Set][Condition] failed, graph_id = %u.", graph_id); graph_node->SetRunFlag(false); return GE_GRAPH_RUNGRAPH_FAILED; } @@ -1273,7 +1277,7 @@ Status GraphManager::InnerRunGraph(GraphNodePtr &graph_node, const GraphId &grap graph_node->SetRunFlag(false); if (ret != SUCCESS) { - GELOGE(ret, "[RunGraph] execute graph failed, graph_id = %u.", graph_id); + GELOGE(ret, "[Execute][Graph] failed, graph_id = %u.", graph_id); return ret; } return SUCCESS; @@ -1283,8 +1287,7 @@ Status GraphManager::InnerRunGraphWithStream(GraphNodePtr &graph_node, const Gra const std::vector &inputs, std::vector &outputs) { auto ret = graph_executor_.SetCondition(&sync_run_mutex_, &condition_, graph_run_listener_); if (ret != SUCCESS) { - GELOGE(GE_GRAPH_RUNGRAPH_FAILED, "[Run][GraphWithStreamAsync] set condition failed, " - "graph id = %u, stream = %p.", graph_id, stream); + GELOGE(GE_GRAPH_RUNGRAPH_FAILED, "[Set][Condition] failed, graph id = %u, stream = %p.", graph_id, stream); graph_node->SetRunFlag(false); return GE_GRAPH_RUNGRAPH_FAILED; } @@ -1293,7 +1296,7 @@ Status GraphManager::InnerRunGraphWithStream(GraphNodePtr &graph_node, const Gra graph_node->SetRunFlag(false); graph_node->SetIsSpecificStream(false); if (ret != SUCCESS) { - GELOGE(ret, "[Run][GraphWithStreamAsync] execute graph failed, graph id = %u, stream = %p.", graph_id, stream); + GELOGE(ret, "[Execute][Graph] With Stream failed, graph id = %u, stream = %p.", graph_id, stream); return ret; } GELOGI("[Run][GraphWithStreamAsync] run graph success, graph id = %u, stream = %p.", graph_id, stream); @@ -1315,18 +1318,20 @@ Status GraphManager::RunGraphWithStreamAsync(const GraphId &graph_id, rtStream_t Status ret = GetGraphNode(graph_id, graph_node); if (ret != SUCCESS) { REPORT_INNER_ERROR("E19999", "graph id = %u not exist in graph_map, check invalid.", graph_id); - GELOGE(ret, "Run graph with stream async graph not exist, graph id = %u.", graph_id); + GELOGE(ret, "[Get][GraphNode] failed, Run graph with stream async, graph not exist, graph id = %u.", graph_id); return ret; } if (graph_node == nullptr) { REPORT_INNER_ERROR("E19999", "Graph node is nullptr in graph_map, graph id = %u, check invalid.", graph_id); - GELOGE(GE_GRAPH_GRAPH_NODE_NULL, "Run graph with stream async graph node is NULL, graph id = %u.", graph_id); + GELOGE(GE_GRAPH_GRAPH_NODE_NULL, "[Check][Param] Run graph with stream async, graph node is NULL, " + "graph id = %u.", graph_id); return GE_GRAPH_GRAPH_NODE_NULL; } if (graph_node->GetRunFlag()) { REPORT_INNER_ERROR("E19999", "Graph is already running, can't be run again, graph id = %u, " "check invalid.", graph_id); - GELOGE(GE_GRAPH_ALREADY_RUNNING, "Run graph with stream async graph already running, graph id = %u.", graph_id); + GELOGE(GE_GRAPH_ALREADY_RUNNING, "[Get][RunFlag] Run graph with stream async graph already running, " + "graph id = %u.", graph_id); return GE_GRAPH_ALREADY_RUNNING; } @@ -1344,7 +1349,7 @@ Status GraphManager::RunGraphWithStreamAsync(const GraphId &graph_id, rtStream_t GeRootModelPtr ge_root_model = nullptr; ret = StartForRunGraph(graph_node, inputs, ge_root_model, session_id); if (ret != SUCCESS) { - GELOGE(ret, "[Run][GraphWithStreamAsync] StartForRunGraph failed!"); + GELOGE(ret, "[Call][StartForRunGraph] failed, session_id:%lu", session_id); graph_node->SetRunFlag(false); return ret; } @@ -1365,23 +1370,20 @@ Status GraphManager::RunGraph(const GraphId &graph_id, const std::vectorGetRunFlag()) { - REPORT_INNER_ERROR("E19999", "Graph is already running, can't be run again, graph_id:%u, " - "check invalid", graph_id); - GELOGE(GE_GRAPH_ALREADY_RUNNING, "[RunGraph] graph already running, graph id = %u", graph_id); + REPORT_INNER_ERROR("E19999", "Graph is already running, can't be run again, graph_id:%u, check invalid", graph_id); + GELOGE(GE_GRAPH_ALREADY_RUNNING, "[Get][RunFlag] graph already running, graph id = %u", graph_id); return GE_GRAPH_ALREADY_RUNNING; } @@ -1395,8 +1397,8 @@ Status GraphManager::RunGraph(const GraphId &graph_id, const std::vectorSetRunFlag(false); return ret; } @@ -1425,7 +1427,7 @@ Status GraphManager::RunGraph(const GraphId &graph_id, const std::vectorIsSummaryGraph()) { ret = SummaryHandle(graph_id, outputs); if (ret != SUCCESS) { - GELOGE(ret, "[RunGraph] SummaryHandle failed!"); + GELOGE(ret, "[Call][SummaryHandle] failed, graph_id:%u", graph_id); } } @@ -1436,7 +1438,7 @@ Status GraphManager::RunGraph(const GraphId &graph_id, const std::vectorInitFlag()) { - REPORT_INNER_ERROR("E19999", "GELib is not init before, graph_id:%u, check invalid", - graph_id); - GELOGE(GE_CLI_GE_NOT_INITIALIZED, "GE is not initialized"); + REPORT_INNER_ERROR("E19999", "GELib is not init before, graph_id:%u, check invalid", graph_id); + GELOGE(GE_CLI_GE_NOT_INITIALIZED, "[Get][GELib] GELib is not init before, graph_id:%u", graph_id); return GE_CLI_GE_NOT_INITIALIZED; } @@ -1524,7 +1523,9 @@ Status GraphManager::BuildGraphForUnregisteredOp(const GraphId &graph_id, const REPORT_INNER_ERROR("E19999", "GetOpsKernelInfoStore fail for op:%s(%s), kernel_lib_name:%s, graph_id:%u, " "check invalid", op_desc->GetName().c_str(), op_desc->GetType().c_str(), op_desc->GetOpKernelLibName().c_str(), graph_id); - GELOGE(FAILED, "Get op kernel info store failed"); + GELOGE(FAILED, "[Get][OpsKernelInfoStore] fail for op:%s(%s), kernel_lib_name:%s, graph_id:%u", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), + op_desc->GetOpKernelLibName().c_str(), graph_id); return FAILED; } @@ -1533,8 +1534,7 @@ Status GraphManager::BuildGraphForUnregisteredOp(const GraphId &graph_id, const REPORT_CALL_ERROR("E19999", "Call CompileOp fail for op:%s(%s), kernel_lib_name:%s, graph_id:%u, " "check invalid", op_desc->GetName().c_str(), op_desc->GetType().c_str(), op_desc->GetOpKernelLibName().c_str(), graph_id); - GELOGE(FAILED, "Get op kernel info store failed"); - GELOGE(ret, "Compile op failed, op = %s, graph_id = %u.", op_desc->GetName().c_str(), graph_id); + GELOGE(ret, "[Compile][Op] failed, op = %s, graph_id = %u.", op_desc->GetName().c_str(), graph_id); return ret; } } @@ -1557,23 +1557,21 @@ Status GraphManager::BuildGraph(const GraphId &graph_id, const std::vectorGetRunFlag()) { REPORT_INNER_ERROR("E19999", "Graph is already running, can't be run again, graph_id:%u, " "check invalid", graph_id); - GELOGE(GE_GRAPH_ALREADY_RUNNING, "[BuildGraph] graph already running, graph id = %u", graph_node->GetGraphId()); + GELOGE(GE_GRAPH_ALREADY_RUNNING, "[Get][RunFlag] graph already running, graph id = %u", graph_node->GetGraphId()); return GE_GRAPH_ALREADY_RUNNING; } @@ -1586,7 +1584,7 @@ Status GraphManager::BuildGraph(const GraphId &graph_id, const std::vectorSetRunFlag(false); if (ret != SUCCESS) { - GELOGE(GE_GRAPH_PRERUN_FAILED, "[BuildGraph] StartForRunGraph failed! graph_id:%u.", graph_id); + GELOGE(GE_GRAPH_PRERUN_FAILED, "[Call][StartForRunGraph] failed! graph_id:%u.", graph_id); return GE_GRAPH_PRERUN_FAILED; } @@ -1606,18 +1604,18 @@ Status GraphManager::BuildGraph(const GraphId &graph_id, const std::vector &attrs, const std::vector &inputs, const std::vector &outputs) { - GE_CHK_BOOL_EXEC(ge::AttrUtils::SetStr(&model, "ATTR_MODEL_OP_TYPE", type), return FAILED, "Set Op[%s] type fail", - type.c_str()); + GE_CHK_BOOL_EXEC(ge::AttrUtils::SetStr(&model, "ATTR_MODEL_OP_TYPE", type), return FAILED, + "[Set][Str] model type[%s] fail", type.c_str()); for (const auto &it : attrs) { GE_CHK_BOOL_EXEC(model.SetAttr("ATTR_MODEL_" + it.first, it.second) == GRAPH_SUCCESS, return FAILED, - "Set OpDesc attribute[%s] fail", it.first.c_str()); + "[Set][Attr] OpDesc attribute[%s] fail", it.first.c_str()); } GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListTensor(&model, "ATTR_MODEL_TENSOR_INPUTS", inputs), return FAILED, - "Set Inputs tensor list fail"); + "[Set][InputsTensor] list fail"); GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListTensor(&model, "ATTR_MODEL_TENSOR_OUTPUTS", outputs), return FAILED, - "Set Outputs tensor list fail"); + "[Set][OutputsTensor] list fail"); return SUCCESS; } @@ -1644,9 +1642,8 @@ Status GraphManager::RemoveGraph(const GraphId &graph_id) { GraphNodePtr graph_node = nullptr; Status ret = GetGraphNode(graph_id, graph_node); if (ret != SUCCESS || graph_node == nullptr) { - REPORT_INNER_ERROR("E19999", "Graph:%u not exist in graph_map, check invalid when GraphManager %s", - graph_id, __FUNCTION__); - GELOGE(GE_GRAPH_GRAPH_NOT_EXIST, "[GraphManager] Id %u does not exists.", graph_id); + REPORT_INNER_ERROR("E19999", "Graph:%u not exist in graph_map, check invalid", graph_id); + GELOGE(GE_GRAPH_GRAPH_NOT_EXIST, "[Get][GraphNode] Id %u does not exists.", graph_id); return GE_GRAPH_GRAPH_NOT_EXIST; } if (graph_node->GetRunFlag()) { @@ -1671,7 +1668,7 @@ Status GraphManager::RemoveGraph(const GraphId &graph_id) { if (rt_ret != RT_ERROR_NONE) { REPORT_CALL_ERROR("E19999", "Call rtSetDevice failed, device_id:%u, graph_id:%u", GetContext().DeviceId(), graph_id); - GELOGE(RT_FAILED, "[GraphManager:] rtSetDevice failed, modelId=%u, graphId=%u.", ge_root_model->GetModelId(), + GELOGE(RT_FAILED, "[Call][RtSetDevice] failed, modelId=%u, graphId=%u.", ge_root_model->GetModelId(), graph_id); return FAILED; } @@ -1679,16 +1676,15 @@ Status GraphManager::RemoveGraph(const GraphId &graph_id) { // unload them respectively. middle_ret = UnloadModel(ge_root_model, graph_id); if (middle_ret != SUCCESS) { - REPORT_INNER_ERROR("E19999", "UnloadModel for graph:%u failed, check unload detail in GraphLoader %s", - graph_id, __FUNCTION__); - GELOGE(middle_ret, "[GraphManager:] unload model failed, graph_id=%u.", graph_id); + REPORT_INNER_ERROR("E19999", "UnloadModel for graph:%u failed, check invalid", graph_id); + GELOGE(middle_ret, "[Unload][Model] model failed, graph_id=%u.", graph_id); ret = middle_ret; } rt_ret = rtDeviceReset(GetContext().DeviceId()); if (rt_ret != RT_ERROR_NONE) { - REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, graph_id:%u, when GraphManager %s", - GetContext().DeviceId(), graph_id, __FUNCTION__); - GELOGE(RT_FAILED, "[GraphManager:] rtDeviceReset failed, graphId=%u.", graph_id); + REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, graph_id:%u", + GetContext().DeviceId(), graph_id); + GELOGE(RT_FAILED, "[Call][RtDeviceReset] failed, device_id:%u, graph_id:%u", GetContext().DeviceId(), graph_id); ret = FAILED; } } @@ -1697,7 +1693,7 @@ Status GraphManager::RemoveGraph(const GraphId &graph_id) { RemoveGraphCount(graph_id); RemoveAddGraphCondition(graph_id); - GE_CHK_STATUS_RET(ret, "[GraphManager:] Remove graph failed, graph_id=%u.", graph_id); + GE_CHK_STATUS_RET(ret, "[Remove][Graph] failed, graph_id=%u.", graph_id); GELOGI("[GraphManager] remove graph success, graph_id=%u.", graph_id); return SUCCESS; } @@ -1710,8 +1706,7 @@ Status GraphManager::ParseOptions(const std::map &opti ret = ParseOption(options, STREAM_MAX_PARALLEL_NUM, options_.stream_max_parallel_num); if (ret != SUCCESS) { GELOGE(GE_GRAPH_OPTIONS_INVALID, - "parse Key:%s value failed, it must be same format as " - "DNN_V100:2,DNN_HCCL:3", + "[Parse][Option] %s value failed, it must be same format as DNN_V100:2,DNN_HCCL:3", STREAM_MAX_PARALLEL_NUM.c_str()); return GE_GRAPH_OPTIONS_INVALID; } @@ -1719,23 +1714,23 @@ Status GraphManager::ParseOptions(const std::map &opti // get stream num ret = ParseOption(options, STREAM_NUM, options_.stream_num); if ((ret != SUCCESS) || (options_.stream_num == 0)) { - GELOGE(GE_GRAPH_OPTIONS_INVALID, "Key:ge.stream_num, its value %d is invalid, must be not equal zero.", - options_.stream_num); + GELOGE(GE_GRAPH_OPTIONS_INVALID, "[Parse][Option] Key:ge.stream_num, its value %d is invalid, " + "must be not equal zero.", options_.stream_num); return GE_GRAPH_OPTIONS_INVALID; } // get perf level, its value please see enum PerfLevel ret = ParseOption(options, PERF_LEVEL, options_.perf_level); if ((ret != SUCCESS) || IsPerfLevelInvalid(options_.perf_level)) { - GELOGE(GE_GRAPH_OPTIONS_INVALID, "Key:ge.perfLevel, its value %d is invalid, must be enum PerfLevel type.", - options_.perf_level); + GELOGE(GE_GRAPH_OPTIONS_INVALID, "[Parse][Option] Key:ge.perfLevel, its value %d is invalid, " + "must be enum PerfLevel type.", options_.perf_level); return GE_GRAPH_OPTIONS_INVALID; } // get encrypt mode ret = ParseOption(options, ENCRYPT_MODE, options_.encrypt_mode); GE_IF_BOOL_EXEC(ret != SUCCESS, - GELOGE(GE_GRAPH_OPTIONS_INVALID, "Key:ge.encryptMode value invalid."); + GELOGE(GE_GRAPH_OPTIONS_INVALID, "[Parse][Option] Key:ge.encryptMode value invalid."); return GE_GRAPH_OPTIONS_INVALID); // get ek file @@ -1775,7 +1770,8 @@ Status GraphManager::ParseOptions(const std::map &opti // get weight compress flag ret = ParseOption(options, COMPRESS_FLAG, options_.compress_flag); GE_IF_BOOL_EXEC(ret != SUCCESS, - GELOGE(GE_GRAPH_OPTIONS_INVALID, "Key:ge.compressFlag value is invalid, must be 0 or 1."); + GELOGE(GE_GRAPH_OPTIONS_INVALID, "[Parse][Option] Key:ge.compressFlag value is invalid, " + "must be 0 or 1."); return GE_GRAPH_OPTIONS_INVALID); // Set Build model and step ParseOption(options, BUILD_MODE, options_.build_mode); @@ -1786,21 +1782,22 @@ Status GraphManager::ParseOptions(const std::map &opti options_.run_graph_flag = true; ret = ParseOption(options, RUN_FLAG, options_.run_graph_flag); GE_IF_BOOL_EXEC(ret != SUCCESS, - GELOGE(GE_GRAPH_OPTIONS_INVALID, "Key:ge.runFlag value is invalid, must be 0 or 1."); + GELOGE(GE_GRAPH_OPTIONS_INVALID, "[Parse][Option] Key:ge.runFlag value is invalid, must be 0 or 1."); return GE_GRAPH_OPTIONS_INVALID); // ge.graphType ret = ParseTrainGraphFlag(options_.run_graph_flag, options_.train_graph_flag); GE_IF_BOOL_EXEC(ret != SUCCESS, - GELOGE(GE_GRAPH_OPTIONS_INVALID, "Key:ge.runFlag value is invalid"); + GELOGE(GE_GRAPH_OPTIONS_INVALID, "[Parse][TrainGraphFlag] Key:ge.runFlag value is invalid"); return GE_GRAPH_OPTIONS_INVALID); // parse FmkOp options_.local_fmk_op_flag = false; ret = ParseOption(options, LOCAL_FMKOP_FLAG, options_.local_fmk_op_flag); GE_IF_BOOL_EXEC(ret != SUCCESS, - GELOGE(GE_GRAPH_OPTIONS_INVALID, "Key:ge.localFmkopFlag value is invalid, must be 0 or 1."); + GELOGE(GE_GRAPH_OPTIONS_INVALID, "[Parse][Option] Key:ge.localFmkopFlag value is invalid, " + "must be 0 or 1."); return GE_GRAPH_OPTIONS_INVALID); options_.enable_print_op_pass = true; ret = ParseOption(options, ENABLE_PRINT_OP_PASS, options_.enable_print_op_pass); @@ -1808,13 +1805,15 @@ Status GraphManager::ParseOptions(const std::map &opti options_.is_single_op = false; ret = ParseOption(options, SINGLE_OP_FLAG, options_.is_single_op); GE_IF_BOOL_EXEC(ret != SUCCESS, - GELOGE(GE_GRAPH_OPTIONS_INVALID, "Key:ge.enablePrintOpPass value is invalid, must be 0 or 1."); + GELOGE(GE_GRAPH_OPTIONS_INVALID, "[Parse][Option] Key:ge.enablePrintOpPass value is invalid, " + "must be 0 or 1."); return GE_GRAPH_OPTIONS_INVALID); // parse hcom parallel options_.hcom_parallel = false; ret = ParseOption(options, HCOM_PARALLEL, options_.hcom_parallel); GE_IF_BOOL_EXEC(ret != SUCCESS, - GELOGE(GE_GRAPH_OPTIONS_INVALID, "Key:ge.hcomParallel value is invalid, must be 0 or 1."); + GELOGE(GE_GRAPH_OPTIONS_INVALID, "[Parse][Option] Key:ge.hcomParallel value is invalid, " + "must be 0 or 1."); return GE_GRAPH_OPTIONS_INVALID); // net output node dataType ParseOption(options, OUTPUT_DATATYPE, options_.output_datatype); @@ -1874,10 +1873,9 @@ Status GraphManager::ParseOption(const std::map &optio } else if (flag == "1") { option = true; } else { - REPORT_INNER_ERROR("E19999", "Option:%s value:%s must be 0 or 1, check invalid", - key.c_str(), flag.c_str()); - GELOGE(GE_GRAPH_OPTIONS_INVALID, "Key:%s, its value %s is invalid, it must be 0 or 1.", key.c_str(), - flag.c_str()); + REPORT_INNER_ERROR("E19999", "Option:%s value:%s must be 0 or 1, check invalid", key.c_str(), flag.c_str()); + GELOGE(GE_GRAPH_OPTIONS_INVALID, "[Check][Param] Key:%s, its value %s is invalid, it must be 0 or 1.", + key.c_str(), flag.c_str()); return GE_GRAPH_OPTIONS_INVALID; } } @@ -1894,8 +1892,8 @@ Status GraphManager::ParseOption(const std::map &optio if (ptr != nullptr && *ptr != '\0') { REPORT_INNER_ERROR("E19999", "Option:%s value:%s must be int32_t type, check invalid", key.c_str(), iter->second.c_str()); - GELOGE(GE_GRAPH_OPTIONS_INVALID, "Key:%s, its value %s is invalid, must be int32_t type.", key.c_str(), - iter->second.c_str()); + GELOGE(GE_GRAPH_OPTIONS_INVALID, "[Check][Param] Key:%s, its value %s is invalid, must be int32_t type.", + key.c_str(), iter->second.c_str()); return GE_GRAPH_OPTIONS_INVALID; } } @@ -1939,10 +1937,8 @@ Status GraphManager::ParseOption(const std::map &optio if (pos == string::npos) { REPORT_INNER_ERROR("E19999", "Option:%s, value:%s, engine and num must be connected by :, check invalid", key.c_str(), engine_parallel.c_str()); - GELOGE(GE_GRAPH_OPTIONS_INVALID, - "engine and num must be connected by :, " - "while your input is %s", - engine_parallel.c_str()); + GELOGE(GE_GRAPH_OPTIONS_INVALID, "[Check][Param] engine and num must be connected by :, " + "while your input is %s", engine_parallel.c_str()); return GE_GRAPH_OPTIONS_INVALID; } std::string engine_name = engine_parallel.substr(0, pos); @@ -1952,14 +1948,14 @@ Status GraphManager::ParseOption(const std::map &optio Status ret = CheckEngineName(engine_name, key, option); if (ret != SUCCESS) { - GELOGE(GE_GRAPH_OPTIONS_INVALID, "check engine name : %s failed, ", engine_name.c_str()); + GELOGE(GE_GRAPH_OPTIONS_INVALID, "[Check][EngineName] %s failed", engine_name.c_str()); return GE_GRAPH_OPTIONS_INVALID; } int num = 0; ret = ParseParallelNum(parallel_num, key, num); if (ret != SUCCESS) { - GELOGE(GE_GRAPH_OPTIONS_INVALID, "parse parallel num failed"); + GELOGE(GE_GRAPH_OPTIONS_INVALID, "[Parse][ParallelNum] %s failed", parallel_num.c_str()); return GE_GRAPH_OPTIONS_INVALID; } @@ -1974,7 +1970,7 @@ Status GraphManager::CheckEngineName(const std::string &engine_name, const std:: if (engine_name.empty()) { REPORT_INNER_ERROR("E19999", "Option:%s, param engine_name:%s is empty, check invalid", key.c_str(), engine_name.c_str()); - GELOGE(GE_GRAPH_OPTIONS_INVALID, "engine name of %s is empty", key.c_str()); + GELOGE(GE_GRAPH_OPTIONS_INVALID, "[Check][Param] engine name of %s is empty", key.c_str()); return GE_GRAPH_OPTIONS_INVALID; } // judge whether exist in engine list @@ -1986,7 +1982,7 @@ Status GraphManager::CheckEngineName(const std::string &engine_name, const std:: if (it_stream_repeat != option.end()) { REPORT_INNER_ERROR("E19999", "Option:%s, param engine_name:%s is repeated, check invalid", key.c_str(), engine_name.c_str()); - GELOGE(GE_GRAPH_OPTIONS_INVALID, "engine : %s of %s is repeated", engine_name.c_str(), key.c_str()); + GELOGE(GE_GRAPH_OPTIONS_INVALID, "[Check][Param] engine:%s of %s is repeated", engine_name.c_str(), key.c_str()); return GE_GRAPH_OPTIONS_INVALID; } return SUCCESS; @@ -1996,14 +1992,15 @@ Status GraphManager::ParseParallelNum(const std::string ¶llel_num, const std if (parallel_num.empty()) { REPORT_INNER_ERROR("E19999", "Option:%s, param parallel num:%s is empty, check invalid", key.c_str(), parallel_num.c_str()); - GELOGE(GE_GRAPH_OPTIONS_INVALID, "parallel num of %s is empty", key.c_str()); + GELOGE(GE_GRAPH_OPTIONS_INVALID, "[Check][Param] parallel num of %s is empty", key.c_str()); return GE_GRAPH_OPTIONS_INVALID; } for (char c : parallel_num) { if (!isdigit(c)) { REPORT_INNER_ERROR("E19999", "Option:%s, param parallel num:%s is not digit, check invalid", key.c_str(), parallel_num.c_str()); - GELOGE(GE_GRAPH_OPTIONS_INVALID, "%s input is invalid ", key.c_str()); + GELOGE(GE_GRAPH_OPTIONS_INVALID, "[Check][Param] Option:%s, param parallel num:%s is not digit, check invalid", + key.c_str(), parallel_num.c_str()); return GE_GRAPH_OPTIONS_INVALID; } } @@ -2013,24 +2010,28 @@ Status GraphManager::ParseParallelNum(const std::string ¶llel_num, const std } catch (std::invalid_argument &) { REPORT_INNER_ERROR("E19999", "Option:%s, param parallel num:%s is invalid argument, check", key.c_str(), parallel_num.c_str()); - GELOGE(GE_GRAPH_OPTIONS_INVALID, "parallel num : %s of %s is invalid argument", parallel_num.c_str(), key.c_str()); + GELOGE(GE_GRAPH_OPTIONS_INVALID, "[Check][Param] parallel num:%s of %s is invalid argument", + parallel_num.c_str(), key.c_str()); return GE_GRAPH_OPTIONS_INVALID; } catch (std::out_of_range &) { REPORT_INNER_ERROR("E19999", "Option:%s, param parallel num:%s is out of range, check", key.c_str(), parallel_num.c_str()); - GELOGE(GE_GRAPH_OPTIONS_INVALID, "parallel num : %s of %s is out of range", parallel_num.c_str(), key.c_str()); + GELOGE(GE_GRAPH_OPTIONS_INVALID, "[Check][Param] parallel num:%s of %s is out of range", + parallel_num.c_str(), key.c_str()); return GE_GRAPH_OPTIONS_INVALID; } catch (...) { REPORT_INNER_ERROR("E19999", "Option:%s, param parallel num:%s is invalid argument, check", key.c_str(), parallel_num.c_str()); - GELOGE(GE_GRAPH_OPTIONS_INVALID, "parallel num : %s of %s is invalid argument", parallel_num.c_str(), key.c_str()); + GELOGE(GE_GRAPH_OPTIONS_INVALID, "[Check][Param] parallel num:%s of %s is invalid argument", + parallel_num.c_str(), key.c_str()); return GE_GRAPH_OPTIONS_INVALID; } if (num < 1) { REPORT_INNER_ERROR("E19999", "Option:%s, param parallel num:%s < 1, check invalid", key.c_str(), parallel_num.c_str()); - GELOGE(GE_GRAPH_OPTIONS_INVALID, "parallel num : %s of %s must bigger than 0", parallel_num.c_str(), key.c_str()); + GELOGE(GE_GRAPH_OPTIONS_INVALID, "[Check][Param] parallel num:%s of %s must bigger than 0", + parallel_num.c_str(), key.c_str()); return GE_GRAPH_OPTIONS_INVALID; } return SUCCESS; @@ -2057,9 +2058,8 @@ Status GraphManager::GetGraphNode(const GraphId &graph_id, GraphNodePtr &out) { auto iter = graph_map_.find(graph_id); if (iter == graph_map_.end()) { out = nullptr; - REPORT_INNER_ERROR("E19999", "Graph:%u not exist in graph_map, check invalid", - graph_id); - GELOGE(GE_GRAPH_GRAPH_NOT_EXIST, "[GraphManager] graph not exist, graph_id= %u.", graph_id); + REPORT_INNER_ERROR("E19999", "Graph:%u not exist in graph_map, check invalid", graph_id); + GELOGE(GE_GRAPH_GRAPH_NOT_EXIST, "[Check][Param] graph not exist, graph_id= %u.", graph_id); return GE_GRAPH_GRAPH_NOT_EXIST; } out = iter->second; @@ -2080,7 +2080,7 @@ Status GraphManager::SummaryHandle(const GraphId &graph_id, std::vector &summary_output_indexes = whole_summary_output_indexes.at(graph_id); @@ -2125,9 +2125,8 @@ Status GraphManager::CheckpointHandle(const GraphId &graph_id, const ComputeGrap } } if (netoutput == nullptr) { - REPORT_INNER_ERROR("E19999", "No netoutput node in graph:%u, check invalid", - graph_id); - GELOGE(FAILED, "Netoutput is null."); + REPORT_INNER_ERROR("E19999", "No netoutput node in graph:%u, check invalid", graph_id); + GELOGE(FAILED, "[Check][Param] No netoutput node in graph:%u", graph_id); return FAILED; } for (const auto &in : netoutput->GetAllInDataAnchors()) { @@ -2135,9 +2134,9 @@ Status GraphManager::CheckpointHandle(const GraphId &graph_id, const ComputeGrap auto out_anchor = in->GetPeerOutAnchor(); if (out_anchor == nullptr) { REPORT_INNER_ERROR("E19999", "Peer anchor of op:%s(%s), in_index:%u is nullptr, graph_id:%u, check invalid", - netoutput->GetName().c_str(), netoutput->GetType().c_str(), - in->GetIdx(), graph_id); - GELOGE(FAILED, "out_anchor is null."); + netoutput->GetName().c_str(), netoutput->GetType().c_str(), in->GetIdx(), graph_id); + GELOGE(FAILED, "[Get][PeerOutAnchor] Peer anchor of op:%s(%s), in_index:%u is nullptr, graph_id:%u", + netoutput->GetName().c_str(), netoutput->GetType().c_str(), in->GetIdx(), graph_id); return FAILED; } ge::NodePtr peer_node = out_anchor->GetOwnerNode(); @@ -2146,7 +2145,8 @@ Status GraphManager::CheckpointHandle(const GraphId &graph_id, const ComputeGrap if (peer_node->GetAllInDataAnchors().size() != 1) { REPORT_INNER_ERROR("E19999", "More than one prior nodes of peer_node:%s(%s) in checkpoint Graph:%u, " "check invalid", peer_node->GetName().c_str(), peer_node->GetType().c_str(), graph_id); - GELOGE(FAILED, "More than one prior nodes of peer_node %s in checkpoint Graph.", peer_node->GetName().c_str()); + GELOGE(FAILED, "[Check][Param] More than one prior nodes of peer_node:%s(%s) in checkpoint Graph:%u.", + peer_node->GetName().c_str(), peer_node->GetType().c_str(), graph_id); return FAILED; } auto peer_node_in = peer_node->GetAllInDataAnchors().at(0); @@ -2160,9 +2160,9 @@ Status GraphManager::CheckpointHandle(const GraphId &graph_id, const ComputeGrap } if (peer_node == nullptr) { REPORT_INNER_ERROR("E19999", "Peer anchor node of op:%s(%s), in_index:%u is nullptr, graph_id:%u, check invalid", - netoutput->GetName().c_str(), netoutput->GetType().c_str(), - in->GetIdx(), graph_id); - GELOGE(FAILED, "No variable op found in one branch, checkpoint graph illegal."); + netoutput->GetName().c_str(), netoutput->GetType().c_str(), in->GetIdx(), graph_id); + GELOGE(FAILED, "[Check][Param] Peer anchor node of op:%s(%s), in_index:%u is nullptr, graph_id:%u", + netoutput->GetName().c_str(), netoutput->GetType().c_str(), in->GetIdx(), graph_id); return FAILED; } desc_name = peer_node->GetName(); @@ -2171,7 +2171,8 @@ Status GraphManager::CheckpointHandle(const GraphId &graph_id, const ComputeGrap REPORT_INNER_ERROR("E19999", "in index:%u of op:%s(%s) is out of outputs.size:%zu range, graph_id:%u, " "check invalid", in->GetIdx(), netoutput->GetName().c_str(), netoutput->GetType().c_str(), outputs.size(), graph_id); - GELOGE(FAILED, "variable index out of range."); + GELOGE(FAILED, "[Check][Param] in index:%u of op:%s(%s) is out of outputs.size:%zu range, graph_id:%u", + in->GetIdx(), netoutput->GetName().c_str(), netoutput->GetType().c_str(), outputs.size(), graph_id); return FAILED; } save_results.emplace(desc_name, TensorAdapter::AsTensor(outputs.at(in->GetIdx()))); @@ -2217,9 +2218,8 @@ Status GraphManager::PushSummaryData2ME(const GraphId &graph_id, } return iter->second(graph_id, tmp_summary_data); } - REPORT_INNER_ERROR("E19999", "No summary callback found, graph_id:%u, check invalid", - graph_id); - GELOGE(FAILED, "[GraphManager] PushSummaryData2ME failed, not found summary callback."); + REPORT_INNER_ERROR("E19999", "No summary callback found, graph_id:%u, check invalid", graph_id); + GELOGE(FAILED, "[Check][Param] No summary callback found, graph_id:%u", graph_id); return FAILED; } return itr->second(graph_id, summary_data); @@ -2239,9 +2239,8 @@ Status GraphManager::PushSaveData2ME(const GraphId &graph_id, const std::mapsecond(graph_id, tmp_save_data); } - REPORT_INNER_ERROR("E19999", "No checkpoint callback found, graph_id:%u, check invalid", - graph_id); - GELOGE(FAILED, "[GraphManager] PushSaveData2ME failed, not found checkpoint callback."); + REPORT_INNER_ERROR("E19999", "No checkpoint callback found, graph_id:%u, check invalid", graph_id); + GELOGE(FAILED, "[Check][Param] No checkpoint callback found, graph_id:%u", graph_id); return FAILED; } return itr->second(graph_id, save_data); @@ -2271,7 +2270,8 @@ bool GraphManager::CheckVariableForCheckpointGraph(NodePtr &node) { if (out == nullptr) { REPORT_INNER_ERROR("E19999", "anchor index:0 of op:%s(%s) is nullptr, check invalid", node->GetName().c_str(), node->GetType().c_str()); - GELOGE(GE_GRAPH_PARAM_NULLPTR, "out is nullptr."); + GELOGE(GE_GRAPH_PARAM_NULLPTR, "[Get][OutDataAnchor] anchor index:0 of op:%s(%s) is nullptr", + node->GetName().c_str(), node->GetType().c_str()); return false; } auto peer_out = out->GetPeerInDataAnchors(); @@ -2304,7 +2304,7 @@ static inline bool CheckConstanOpForCheckpointGraph(NodePtr &node) { return node bool GraphManager::IsCheckpointGraph(ComputeGraphPtr &compute_graph) { if (compute_graph == nullptr) { REPORT_INNER_ERROR("E19999", "Param compute_graph is nullptr, check invalid"); - GELOGE(GE_GRAPH_PARAM_NULLPTR, "[IsCheckpointGraph] computeGraph is nullptr."); + GELOGE(GE_GRAPH_PARAM_NULLPTR, "[Check][Param] computeGraph is nullptr."); return false; } for (auto &node : compute_graph->GetAllNodes()) { @@ -2438,9 +2438,8 @@ Status GraphManager::RemoveIsolatedConstInThisGraph(ge::ComputeGraphPtr &compute if (n->GetOutAllNodes().empty() && n->GetInAllNodes().empty()) { // it is an isolated constant, just remove it if (GraphUtils::RemoveJustNode(compute_graph, n) != GRAPH_SUCCESS) { - REPORT_CALL_ERROR("E19999", "Remove constant op:%s(%s) failed", - n->GetName().c_str(), n->GetType().c_str()); - GELOGE(FAILED, "remove constant %s failed.", n->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "Remove constant op:%s(%s) failed", n->GetName().c_str(), n->GetType().c_str()); + GELOGE(FAILED, "[Call][RemoveJustNode] remove constant %s failed.", n->GetName().c_str()); return FAILED; } } @@ -2519,7 +2518,7 @@ Status GraphManager::OptimizeStage1(ge::ComputeGraphPtr &compute_graph) { auto ret = after_merge_passes.Run(compute_graph); GE_TIMESTAMP_END(after_merge_passes, "GraphManager::OptimizeStage1_1"); if (ret != SUCCESS && ret != NOT_CHANGED) { - GELOGE(ret, "Run passes when OptimizeStage1_1 failed, ret:%u.", ret); + GELOGE(ret, "[Run][Passes] when OptimizeStage1_1 failed, ret:%u.", ret); return ret; } @@ -2558,7 +2557,7 @@ Status GraphManager::OptimizeStage1(ge::ComputeGraphPtr &compute_graph) { ret = GEPass(compute_graph).Run(names_to_passes); GE_TIMESTAMP_END(names_to_passes, "GraphManager::OptimizeStage1_2"); if (ret != SUCCESS) { - GELOGE(ret, "Run passes when OptimizeStage1_2 failed, ret:%u.", ret); + GELOGE(ret, "[Run][Passes] when OptimizeStage1_2 failed, ret:%u.", ret); return ret; } // Calculate Op/Fe constantfolding cost @@ -2614,7 +2613,7 @@ Status GraphManager::OptimizeStage1(ge::ComputeGraphPtr &compute_graph) { ret = graph_pass.Run(compute_graph); GE_TIMESTAMP_END(graph_pass, "GraphManager::OptimizeStage1_3"); if (ret != SUCCESS && ret != NOT_CHANGED) { - GELOGE(ret, "Run passes when OptimizeStage1_3 failed, ret:%u.", ret); + GELOGE(ret, "[Run][Passes] when OptimizeStage1_3 failed, ret:%u.", ret); return ret; } NamesToPass node_pass; @@ -2624,7 +2623,7 @@ Status GraphManager::OptimizeStage1(ge::ComputeGraphPtr &compute_graph) { ret = GEPass(compute_graph).Run(node_pass); GE_TIMESTAMP_END(node_pass, "GraphPrepare::node_pass"); if (ret != SUCCESS) { - GELOGE(ret, "Run identity remove pass for preprocess failed, ret:%u.", ret); + GELOGE(ret, "[Run][Identity] remove pass for preprocess failed, ret:%u.", ret); return ret; } return SUCCESS; @@ -2644,7 +2643,7 @@ Status GraphManager::OptimizeStage2(ge::ComputeGraphPtr &compute_graph) { auto ret = after_merge_passes.Run(compute_graph); GE_TIMESTAMP_END(after_merge_passes, "OptimizeStage2::AfterMergePasses"); if (ret != SUCCESS && ret != NOT_CHANGED) { - GELOGE(ret, "Run passes after merge sub graph failed, ret:%d.", ret); + GELOGE(ret, "[Run][Passes] after merge sub graph failed, ret:%d.", ret); return ret; } SetAttrForHcomBroadCastOp(compute_graph); @@ -2668,13 +2667,13 @@ Status GraphManager::OptimizeStage2(ge::ComputeGraphPtr &compute_graph) { ret = GEPass(compute_graph).Run(names_to_passes); GE_TIMESTAMP_END(names_to_passes, "OptimizeStage2::MergedGraphNameToPasses"); if (ret != SUCCESS) { - GELOGE(ret, "Run ge_passes optimize for OptimizeAfterMergeSubGraph failed, ret:%d.", ret); + GELOGE(ret, "[Run][GEPasses] optimize for OptimizeAfterMergeSubGraph failed, ret:%d.", ret); return ret; } ret = RemoveIsolatedConst(compute_graph); if (ret != SUCCESS) { - GELOGE(ret, "Remove isolated Constant failed, ret:%d.", ret); + GELOGE(ret, "[Remove][IsolatedConst] failed, ret:%d.", ret); return ret; } @@ -2725,32 +2724,33 @@ Status GraphManager::OptimizeStage2(ge::ComputeGraphPtr &compute_graph) { ret = pass_for_control_attr_optimize.Run(compute_graph); GE_TIMESTAMP_END(pass_for_control_attr_optimize, "OptimizeStage2::ControlAttrOptimize"); if (ret != SUCCESS && ret != NOT_CHANGED) { - GELOGE(ret, "Run passes when optimize stage 2 failed"); + GELOGE(ret, "[Run][Passes] when optimize stage 2 failed"); return ret; } // Assign functional op labels. GE_TIMESTAMP_START(AssignFunctionalLabels); LabelAllocator label_allocator(compute_graph); - GE_CHK_STATUS_RET(label_allocator.AssignFunctionalLabels(), "Assign label failed."); + GE_CHK_STATUS_RET(label_allocator.AssignFunctionalLabels(), "[Assign][Label] failed."); GE_TIMESTAMP_END(AssignFunctionalLabels, "ModelBuilder::AssignFunctionalLabels"); // Add memcpy addr asynchronous node. GE_TIMESTAMP_START(AddMemcpyAddrAsyncNode); MemcpyAddrAsyncPass memcpy_addr; - GE_CHK_STATUS_RET(memcpy_addr.Run(compute_graph), "Add memcpy_addr_async node failed."); + GE_CHK_STATUS_RET(memcpy_addr.Run(compute_graph), "[Call][Run] Add memcpy_addr_async node failed."); GE_TIMESTAMP_END(AddMemcpyAddrAsyncNode, "MemcpyAddrAsyncPass::Run."); // Process offset and dependency for buffer pool memory assigner. GE_TIMESTAMP_START(BufferPoolMemoryPass); BufferPoolMemoryPass buffer_pool_mem_pass; - GE_CHK_STATUS_RET(buffer_pool_mem_pass.Run(compute_graph), "Failed to process for buffer pool allocator."); + GE_CHK_STATUS_RET(buffer_pool_mem_pass.Run(compute_graph), + "[Call][Run] Failed to process for buffer pool allocator."); GE_TIMESTAMP_END(BufferPoolMemoryPass, "BufferPoolMemoryPass::Run."); // Handle parallel group . GE_TIMESTAMP_START(ParallelGroup); ParallelGroupPass parallel_group_pass; - GE_CHK_STATUS_RET(parallel_group_pass.Run(compute_graph), "Handle parallel group failed."); + GE_CHK_STATUS_RET(parallel_group_pass.Run(compute_graph), "[Handle][ParallelGroup] failed."); GE_TIMESTAMP_END(ParallelGroup, "ParallelGroupPass::Run."); // After while sub graph handle, mark all node rw type @@ -2803,7 +2803,7 @@ Status GraphManager::LoadGraphAsync(const GeRootModelPtr &ge_root_model, const G Status ret = GraphLoader::LoadModelOnline(model_id_info.model_id, ge_root_model, listener); GE_TIMESTAMP_EVENT_END(LoadGraph, "GraphManager::LoadGraphAsync"); if (ret != SUCCESS) { - GELOGE(ret, "[LoadGraphAsync] LoadGraphAsync Failed"); + GELOGE(ret, "[Load][ModelOnline] Failed, model_id:%u", model_id_info.model_id); graph_node->SetRunFlag(false); return ret; } @@ -2818,9 +2818,8 @@ void GraphManager::ReleaseMemory(const GeModelPtr &ge_model, GraphNodePtr &graph const std::vector &model_ids, uint32_t graph_id, uint64_t session_id) { rtError_t rt_ret = rtSetDevice(GetContext().DeviceId()); if (rt_ret != RT_ERROR_NONE) { - REPORT_CALL_ERROR("E19999", "Call rtSetDevice failed, device_id:%u, when GraphManager %s", - GetContext().DeviceId(), __FUNCTION__); - GELOGE(RT_FAILED, "[GraphManager:] rtSetDevice failed, graphId=%u.", graph_id); + REPORT_CALL_ERROR("E19999", "Call rtSetDevice failed, device_id:%u", GetContext().DeviceId()); + GELOGE(RT_FAILED, "[Call][RtSetDevice] failed, device_id=%u.", GetContext().DeviceId()); return; } for (auto model_id : model_ids) { @@ -2862,9 +2861,8 @@ void GraphManager::ReleaseMemory(const GeModelPtr &ge_model, GraphNodePtr &graph ge_root_model->ClearAllModelId(); rt_ret = rtDeviceReset(GetContext().DeviceId()); if (rt_ret != RT_ERROR_NONE) { - REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, when GraphManager %s", - GetContext().DeviceId(), __FUNCTION__); - GELOGE(RT_FAILED, "[GraphManager:] rtDeviceReset failed, graphId=%u.", graph_id); + REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u", GetContext().DeviceId()); + GELOGE(RT_FAILED, "[Call][RtDeviceReset] failed, device_id:%u.", GetContext().DeviceId()); return; } } @@ -2892,7 +2890,8 @@ Status GraphManager::CheckAndReleaseMemory(const GeModelPtr &ge_model, const Gra if (ge::CheckInt64AddOverflow(memory_size, weight_size) != SUCCESS) { REPORT_INNER_ERROR("E19999", "memory_size:%ld and weight_size:%ld will overflow after add, check invalid", memory_size, weight_size); - GELOGE(INTERNAL_ERROR, "The sum of Memory size and weight size exceeds INT64_MAX"); + GELOGE(INTERNAL_ERROR, "[Check][Param] memory_size:%ld and weight_size:%ld will overflow after add", + memory_size, weight_size); return INTERNAL_ERROR; } if (free_memory >= (memory_size + weight_size)) { @@ -2952,23 +2951,25 @@ Status GraphManager::ProcessSubGraphWithMultiThreads(GraphManager *graph_manager GE_DUMP(compute_graph_tmp, "OptimizeSubGraphBefore"); GE_CHECK_NOTNULL(compute_graph_tmp); if (!AttrUtils::SetInt(*compute_graph_tmp, ATTR_NAME_ROOT_GRAPH_ID, root_graph_id)) { - REPORT_CALL_ERROR("E19999", "Set Attr:%s to graph:%u", ATTR_NAME_ROOT_GRAPH_ID.c_str(), + REPORT_CALL_ERROR("E19999", "Set Attr:%s to graph:%u failed", ATTR_NAME_ROOT_GRAPH_ID.c_str(), compute_graph_tmp->GetGraphID()); - GELOGE(FAILED, "Failed to set attr ATTR_NAME_ROOT_GRAPH_ID for subgraph, graph_id: %u.", root_graph_id); + GELOGE(FAILED, "[Set][Attr] %s to graph:%u failed", ATTR_NAME_ROOT_GRAPH_ID.c_str(), + compute_graph_tmp->GetGraphID()); return FAILED; } if (!AttrUtils::SetStr(*compute_graph_tmp, ATTR_NAME_ROOT_GRAPH_NAME, root_graph_name)) { - REPORT_CALL_ERROR("E19999", "Set Attr:%s to graph:%u", ATTR_NAME_ROOT_GRAPH_NAME.c_str(), + REPORT_CALL_ERROR("E19999", "Set Attr:%s to graph:%u failed", ATTR_NAME_ROOT_GRAPH_NAME.c_str(), compute_graph_tmp->GetGraphID()); - GELOGE(FAILED, "Failed to set attr ATTR_NAME_ROOT_GRAPH_NAME for subgraph, \ - root_graph_name: %s.", root_graph_name.c_str()); + GELOGE(FAILED, "[Set][Attr] %s to graph:%u failed", ATTR_NAME_ROOT_GRAPH_NAME.c_str(), + compute_graph_tmp->GetGraphID()); return FAILED; } compute_graph_tmp->SetSessionID(session_id); Status ret = graph_manager->GetCompilerStages(root_graph_id).optimizer.OptimizeSubGraph(compute_graph_tmp, engine_name); if (ret != SUCCESS) { - GELOGE(ret, "SubGraph optimize Failed %s", engine_name.c_str()); + GELOGE(ret, "[Optimize][SubGraph] Failed, engine:%s, graph:%s", + engine_name.c_str(), compute_graph_tmp->GetName().c_str()); return ret; } else { GELOGD("SubGraph optimize success %s", engine_name.c_str()); @@ -2980,7 +2981,7 @@ Status GraphManager::ProcessSubGraphWithMultiThreads(GraphManager *graph_manager pthread_self()); } else { REPORT_INNER_ERROR("E19999", "Param sub_graph_info_ptr or graph_manager is nullptr"); - GELOGE(FAILED, "graph_manager or sub_graph_info_ptr is nullptr"); + GELOGE(FAILED, "[Check][Param] graph_manager or sub_graph_info_ptr is nullptr"); return FAILED; } @@ -2997,7 +2998,7 @@ Status GraphManager::RunGraphAsync(const GraphId &graph_id, const std::vectorGetGraphCount(args.graph_id, count) != SUCCESS) { - GELOGE(INTERNAL_ERROR, "Get graph [id:%u] count failed.", args.graph_id); + GELOGE(INTERNAL_ERROR, "[Get][GraphCount] failed, graph id:%u.", args.graph_id); return; } // Avoid repeatively prerun for graphs owns same graph_id in online inference concurrency @@ -3215,7 +3216,8 @@ Status GraphManager::ParseInputsDimsForGetNexNosinkAndData(const vector if (!(AttrUtils::GetInt(op_desc, ATTR_NAME_INDEX, index))) { REPORT_CALL_ERROR("E19999", "Get Attr:%s from op:%s(%s) fail", ATTR_NAME_INDEX.c_str(), op_desc->GetName().c_str(), op_desc->GetType().c_str()); - GELOGE(PARAM_INVALID, "Get index from attr failed"); + GELOGE(PARAM_INVALID, "[Get][Attr] %s from op:%s(%s) fail", ATTR_NAME_INDEX.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); return PARAM_INVALID; } if (static_cast(index) > input_tensor.size()) { @@ -3223,7 +3225,9 @@ Status GraphManager::ParseInputsDimsForGetNexNosinkAndData(const vector "check invalid", ATTR_NAME_INDEX.c_str(), op_desc->GetName().c_str(), op_desc->GetType().c_str(), index, input_tensor.size()); - GELOGE(PARAM_INVALID, "The count of input tensor should be equal to the count of data."); + GELOGE(PARAM_INVALID, "[Check][Param] Attr:%s in op:%s(%s) value:%ld > param input_tensor.size:%zu", + ATTR_NAME_INDEX.c_str(), op_desc->GetName().c_str(), op_desc->GetType().c_str(), + index, input_tensor.size()); return PARAM_INVALID; } @@ -3253,7 +3257,7 @@ Status GraphManager::ParseInputsDims(const std::vector &input_tensor } else { // data+getnext_nosink, but only need to get shape_dims of data if (ParseInputsDimsForGetNexNosinkAndData(data_nodes, input_tensor) != SUCCESS) { - GELOGE(PARAM_INVALID, "Failed to parse dims from data, when data coexist with getnext nosink."); + GELOGE(PARAM_INVALID, "[Parse][Dims] from data failed, when data coexist with getnext nosink."); return PARAM_INVALID; } } @@ -3269,7 +3273,7 @@ Status GraphManager::ParseInputsDims(const std::vector &input_tensor } else { // getnext_nosink + data, but only need to get shape_dims of getnext_nosink if (ParseInputsDimsForGetNexNosinkAndData(getnext_nosink_nodes, input_tensor) != SUCCESS) { - GELOGE(PARAM_INVALID, "Failed to parse dims from getnext nosink, when data coexist with getnext nosink"); + GELOGE(PARAM_INVALID, "[Parse][Dims] from getnext nosink failed, when data coexist with getnext nosink"); return PARAM_INVALID; } } @@ -3376,9 +3380,8 @@ void GraphManager::ReturnError(GraphManager *graph_manager, GraphNodePtr &graph_ std::vector outputs; auto compute_graph = GraphUtils::GetComputeGraph(*graph_node->GetGraph()); if (graph_manager == nullptr || compute_graph == nullptr) { - REPORT_INNER_ERROR("E19999", "Param graph_manager or compute_graph in graph_node is nullptr, " - "check invalid"); - GELOGE(GRAPH_FAILED, "[Analyze Mode] compute graph is null!"); + REPORT_INNER_ERROR("E19999", "Param graph_manager or compute_graph in graph_node is nullptr, check invalid"); + GELOGE(GRAPH_FAILED, "[Check][Param] compute graph or graph manager is nullptr"); callback(GRAPH_FAILED, outputs); return; } @@ -3401,7 +3404,8 @@ void GraphManager::ReturnError(GraphManager *graph_manager, GraphNodePtr &graph_ REPORT_INNER_ERROR("E19999", "InputIndex:%zu ShapeSize:%ld of op:%s(%s) < 0, unknown shape is not support, " "check invalid", i, len, node->GetName().c_str(), node->GetType().c_str()); - GELOGE(GRAPH_FAILED, "Analyze Mode does not support GEOP output unknown shape!"); + GELOGE(GRAPH_FAILED, "[Check][Param] InputIndex:%zu ShapeSize:%ld of op:%s(%s) < 0, " + "unknown shape is not support", i, len, node->GetName().c_str(), node->GetType().c_str()); callback(GRAPH_FAILED, outputs); return; } else if (len == 0) { @@ -3411,8 +3415,8 @@ void GraphManager::ReturnError(GraphManager *graph_manager, GraphNodePtr &graph_ auto length = GetSizeInBytes(len, input_desc->GetDataType()); auto aligned_ptr = MakeShared(length, kAlignment); if (aligned_ptr == nullptr) { - REPORT_INNER_ERROR("E19999", "Aligned_ptr is nullptr"); - GELOGE(GRAPH_FAILED, "[Analyze Mode] Aligned_ptr is nullptr"); + REPORT_CALL_ERROR("E19999", "New AlignedPtr failed, len:%ld", length); + GELOGE(GRAPH_FAILED, "[Create][AlignedPtr] failed, len:%ld", length); return; } ge_tensor.SetData(aligned_ptr, length); @@ -3434,16 +3438,14 @@ bool GraphManager::IsGraphNeedRebuild(uint32_t graph_id) { GraphNodePtr graph_node = nullptr; Status ret = GetGraphNode(graph_id, graph_node); if (ret != SUCCESS) { - REPORT_INNER_ERROR("E19999", "Graph:%u not exist in graph_map, check invalid", - graph_id); - GELOGE(ret, "[RunGraph] graph not exist, graph_id=%u.", graph_id); + REPORT_INNER_ERROR("E19999", "Graph:%u not exist in graph_map, check invalid", graph_id); + GELOGE(ret, "[Get][GraphNode] failed, graph not exist, graph_id:%u.", graph_id); return true; } if (graph_node == nullptr) { - REPORT_INNER_ERROR("E19999", "Graph node is nullptr in graph_map, graph_id:%u, check invalid", - graph_id); - GELOGE(GE_GRAPH_GRAPH_NODE_NULL, "[RunGraph] graph node is NULL, graphId=%u.", graph_id); + REPORT_INNER_ERROR("E19999", "Graph node is nullptr in graph_map, graph_id:%u, check invalid", graph_id); + GELOGE(GE_GRAPH_GRAPH_NODE_NULL, "[Check][Param] graph node is NULL, graph_id:%u.", graph_id); return true; } @@ -3457,16 +3459,14 @@ const map *GraphManager::GetGraphOptions(uint32_t grap GraphNodePtr graph_node = nullptr; Status ret = GetGraphNode(graph_id, graph_node); if (ret != SUCCESS) { - REPORT_INNER_ERROR("E19999", "Graph:%u not exist in graph_map, check invalid", - graph_id); - GELOGE(ret, "[RunGraph] graph not exist, graph_id=%u.", graph_id); + REPORT_INNER_ERROR("E19999", "Graph:%u not exist in graph_map, check invalid", graph_id); + GELOGE(ret, "[Get][GraphNode] failed, graph not exist, graph_id:%u.", graph_id); return nullptr; } if (!graph_node) { - REPORT_INNER_ERROR("E19999", "Graph node is nullptr in graph_map, graph_id:%u, check invalid", - graph_id); - GELOGE(GE_GRAPH_GRAPH_NODE_NULL, "[RunGraph] graph node is NULL, graph_id=%u.", graph_id); + REPORT_INNER_ERROR("E19999", "Graph node is nullptr in graph_map, graph_id:%u, check invalid", graph_id); + GELOGE(GE_GRAPH_GRAPH_NODE_NULL, "[Check][Param] graph node is NULL, graph_id:%u.", graph_id); return nullptr; } return &(graph_node->GetOptions()); @@ -3482,7 +3482,7 @@ Status GraphManager::OptimizeSubgraph(const GraphNodePtr &graph_node, ComputeGra StagePartitioner stage_partitioner(compute_graph); auto ret = stage_partitioner.Partition(); if (ret != SUCCESS) { - GELOGE(ret, "Graph partition by stage Failed"); + GELOGE(ret, "[Call][Partition] for Graph:%s by stage Failed", compute_graph->GetName().c_str()); return ret; } GE_TIMESTAMP_EVENT_END(StagePartition, "OptimizeSubgraph::StagePartition"); @@ -3491,14 +3491,16 @@ Status GraphManager::OptimizeSubgraph(const GraphNodePtr &graph_node, ComputeGra DynamicShapePartitioner dynamic_shape_partitioner(compute_graph); ret = dynamic_shape_partitioner.Partition(); if (ret != SUCCESS) { - GELOGE(ret, "Graph partition by dynamic shape Failed"); + GELOGE(ret, "[Call][Partition] for Graph:%s by dynamic shape Failed", compute_graph->GetName().c_str()); return ret; } bool dynamic_shape_partitioned = false; if (!AttrUtils::GetBool(*compute_graph, ATTR_NAME_DYNAMIC_SHAPE_PARTITIONED, dynamic_shape_partitioned)) { - REPORT_INNER_ERROR("E19999", "Get Attr:%s from graph:%u fail", - ATTR_NAME_DYNAMIC_SHAPE_PARTITIONED.c_str(), compute_graph->GetGraphID()); - GELOGE(FAILED, "failed get dynamic shape partitioned flag on partitioned graph."); + REPORT_INNER_ERROR("E19999", "Get Attr:%s from graph:%s(id:%u) fail", + ATTR_NAME_DYNAMIC_SHAPE_PARTITIONED.c_str(), compute_graph->GetName().c_str(), + compute_graph->GetGraphID()); + GELOGE(FAILED, "[Get][Attr] %s from graph:%u failed", + ATTR_NAME_DYNAMIC_SHAPE_PARTITIONED.c_str(), compute_graph->GetGraphID()); return FAILED; } GE_TIMESTAMP_EVENT_END(GraphPartitionDynamicShape, "OptimizeSubgraph::GraphPartitionDynamicShape"); @@ -3507,14 +3509,14 @@ Status GraphManager::OptimizeSubgraph(const GraphNodePtr &graph_node, ComputeGra GraphPartitioner &partitioner = GetCompilerStages(graph_node->GetGraphId()).partitioner; ret = partitioner.Partition(compute_graph, GraphPartitioner::kPartitioning); if (ret != SUCCESS) { - GELOGE(ret, "Graph partition Failed"); + GELOGE(ret, "[Call][Partition] for Graph:%s Failed", compute_graph->GetName().c_str()); return ret; } GE_TIMESTAMP_EVENT_END(GraphPartition, "OptimizeSubgraph::Partition1"); GE_TIMESTAMP_START(SetSubgraph); ret = SetSubgraph(session_id, compute_graph, partitioner); if (ret != SUCCESS) { - GELOGE(ret, "Graph set subgraph Failed"); + GELOGE(ret, "[Set][Subgraph] failed for graph:%s, session_id:%lu", compute_graph->GetName().c_str(), session_id); return ret; } GE_TIMESTAMP_EVENT_END(SetSubgraph, "OptimizeSubgraph::SetSubGraph"); @@ -3527,7 +3529,7 @@ Status GraphManager::OptimizeSubgraph(const GraphNodePtr &graph_node, ComputeGra Status ret = ConvertGraphToFile(compute_graph, partitioner, tuning_path, (options_.build_step == BUILD_STEP_AFTER_BUILDER)); if (ret != SUCCESS) { - GELOGE(ret, "Convert graph[%s] to file failed", compute_graph->GetName().c_str()); + GELOGE(ret, "[Convert][Graph] [%s] to file failed", compute_graph->GetName().c_str()); return ret; } GE_TIMESTAMP_EVENT_END(ConvertGraphToFile, "OptimizeSubgraph::ConvertGraphToFile"); @@ -3540,7 +3542,8 @@ Status GraphManager::OptimizeSubgraph(const GraphNodePtr &graph_node, ComputeGra GE_TIMESTAMP_START(MergeSubgraph); ret = MergeSubGraph(merged_compute_graph, compute_graph, graph_node->GetGraphId()); if (ret != SUCCESS) { - GELOGE(ret, "Merge SubGraph Failed"); + GELOGE(ret, "[Merge][SubGraph] Failed, graph:%s(id:%u)", + compute_graph->GetName().c_str(), graph_node->GetGraphId()); return ret; } GE_CHECK_NOTNULL(merged_compute_graph); @@ -3557,8 +3560,8 @@ Status GraphManager::OptimizeSubgraph(const GraphNodePtr &graph_node, ComputeGra if (!AttrUtils::SetBool(merged_compute_graph, ATTR_NAME_OFF_SUPERKERNEL_ATTR, off_superkernel)) { REPORT_INNER_ERROR("E19999", "Set Attr:%s to graph:%u fail", ATTR_NAME_OFF_SUPERKERNEL_ATTR.c_str(), compute_graph->GetGraphID()); - GELOGE(FAILED, "Compute graph %s set superkernel flag %d failed", merged_compute_graph->GetName().c_str(), - off_superkernel); + GELOGE(FAILED, "[Set][Attr] %s to graph:%u fail", + ATTR_NAME_OFF_SUPERKERNEL_ATTR.c_str(), compute_graph->GetGraphID()); return FAILED; } } @@ -3568,7 +3571,8 @@ Status GraphManager::OptimizeSubgraph(const GraphNodePtr &graph_node, ComputeGra if (!AttrUtils::SetBool(*compute_graph, ATTR_NAME_DYNAMIC_SHAPE_PARTITIONED, dynamic_shape_partitioned)) { REPORT_INNER_ERROR("E19999", "Set Attr:%s to graph:%u fail", ATTR_NAME_DYNAMIC_SHAPE_PARTITIONED.c_str(), compute_graph->GetGraphID()); - GELOGE(FAILED, "failed set dynamic shape partitioned flag on partitioned graph."); + GELOGE(FAILED, "[Set][Attr] %s to graph:%u fail", + ATTR_NAME_DYNAMIC_SHAPE_PARTITIONED.c_str(), compute_graph->GetGraphID()); return FAILED; } return SUCCESS; @@ -3628,7 +3632,7 @@ Status GraphManager::Build(const GraphNodePtr &graph_node, ComputeGraphPtr &comp auto ret = GetCompilerStages(graph_node->GetGraphId()).builder.Build(compute_graph, ge_root_model, session_id); if (ret != SUCCESS) { - GELOGE(ret, "SubGraph build Failed."); + GELOGE(ret, "[Call][Build] failed, session_id:%lu.", session_id); return ret; } @@ -3666,7 +3670,7 @@ Status GraphManager::GenCheckPointGraph(const std::mapAddOutputDesc(iter->second)); NodePtr var_node = compute_graph->AddNode(var_desc); GE_CHK_STATUS(GraphUtils::AddEdge(var_node->GetOutDataAnchor(0), save_node->GetInDataAnchor(index)), - "Add edge[%s->%s] fail.", var_node->GetName().c_str(), save_node->GetName().c_str()); + "[Add][Edge][%s->%s] fail.", var_node->GetName().c_str(), save_node->GetName().c_str()); index++; } compute_graph->Dump(); @@ -3677,13 +3681,12 @@ Status GraphManager::GenCheckPointGraph(const std::map &var_names, const std::vector &outputs, std::vector &var_values) { map var_results; - GE_CHK_STATUS_RET(SaveCheckPointResult(graph, outputs, var_results), "Save check point result failed."); + GE_CHK_STATUS_RET(SaveCheckPointResult(graph, outputs, var_results), "[Save][CheckPointResult] failed."); if (!var_names.empty()) { for (const auto &var_name : var_names) { if (var_results.count(var_name) == 0) { - REPORT_INNER_ERROR("E19999", "Fetch Var:%s result value fail", - var_name.c_str()); - GELOGE(FAILED, "Fetch var[%s] value failed.", var_name.c_str()); + REPORT_INNER_ERROR("E19999", "Fetch Var:%s result value fail", var_name.c_str()); + GELOGE(FAILED, "[Check][Param] Fetch var[%s] value failed.", var_name.c_str()); return FAILED; } else { auto var_tensor = var_results[var_name].GetTensorDesc(); @@ -3724,7 +3727,8 @@ Status GraphManager::SaveCheckPointResult(const Graph &graph, const std::vector< REPORT_INNER_ERROR("E19999", "peer node:%s(%s) of netoutput has more than 1 input in checkpoint Graph, " "check invalid", peer_node->GetName().c_str(), peer_node->GetType().c_str()); - GELOGE(FAILED, "peer_node [%s] has more than 1 input in checkpoint Graph.", peer_node->GetName().c_str()); + GELOGE(FAILED, "[Check][Param] peer_node [%s] has more than 1 input in checkpoint Graph.", + peer_node->GetName().c_str()); return FAILED; } auto peer_node_in_anchor = peer_node->GetAllInDataAnchors().at(0); @@ -3740,7 +3744,7 @@ Status GraphManager::SaveCheckPointResult(const Graph &graph, const std::vector< REPORT_INNER_ERROR("E19999", "peer node:%s(%s) of netoutput is not variable in checkpoint Graph, " "check invalid", peer_node->GetName().c_str(), peer_node->GetType().c_str()); - GELOGE(FAILED, " peer_node %s is not variable in checkpoint Graph.", peer_node->GetName().c_str()); + GELOGE(FAILED, "[Check][Param] peer_node %s is not variable in checkpoint Graph.", peer_node->GetName().c_str()); return FAILED; } auto var_name = peer_node->GetName(); @@ -3748,7 +3752,7 @@ Status GraphManager::SaveCheckPointResult(const Graph &graph, const std::vector< if (in->GetIdx() >= static_cast(outputs.size())) { REPORT_INNER_ERROR("E19999", "In index:%u of netoutput is out of outputs.size:%zu range in checkpoint Graph, " "check invalid", in->GetIdx(), outputs.size()); - GELOGE(FAILED, "variable index[%d] out of range[%zu].", in->GetIdx(), outputs.size()); + GELOGE(FAILED, "[Check][Param] variable index[%d] out of range[%zu].", in->GetIdx(), outputs.size()); return FAILED; } var_results.emplace(var_name, outputs.at(in->GetIdx())); diff --git a/ge/graph/manager/graph_manager_utils.cc b/ge/graph/manager/graph_manager_utils.cc index 0f93654c..a70b15a6 100644 --- a/ge/graph/manager/graph_manager_utils.cc +++ b/ge/graph/manager/graph_manager_utils.cc @@ -46,7 +46,7 @@ GraphNode::GraphNode(GraphId graph_id) sem_(1) { graph_run_async_listener_ = MakeShared(); if (graph_run_async_listener_ == nullptr) { - GELOGE(MEMALLOC_FAILED, "Make shared failed"); + GELOGE(MEMALLOC_FAILED, "[New][RunAsyncListener] failed"); } } @@ -82,7 +82,8 @@ SubGraphInfo::~SubGraphInfo() { rt_ret = rtFreeHost(buffer_addr); buffer_addr = nullptr; if (rt_ret != RT_ERROR_NONE) { - GELOGE(rt_ret, "[GraphManager] subgraph free buffer failed, modelId = %u", model_id_info_.model_id); + GELOGE(rt_ret, "[Call][RtFreeHost] subgraph free buffer failed, modelId = %u", + model_id_info_.model_id); } } } @@ -94,8 +95,8 @@ Status SubGraphInfo::FreeInOutBuffer() { rtError_t rt_ret; rt_ret = rtFreeHost(*iter); if (rt_ret != RT_ERROR_NONE) { - REPORT_CALL_ERROR("E19999", "Call rtFreeHost fail"); - GELOGE(rt_ret, "[GraphManager] subgraph free buffer failed, modelId = %u", model_id_info_.model_id); + REPORT_CALL_ERROR("E19999", "Call rtFreeHost fail, ret:%d", rt_ret); + GELOGE(rt_ret, "[Call][RtFreeHost] subgraph free buffer failed, modelId = %u", model_id_info_.model_id); buffer_addr_.erase(buffer_addr_.begin(), iter); return GE_GRAPH_FREE_FAILED; } @@ -131,7 +132,7 @@ Status GraphModelListener::OnComputeDone(uint32_t model_id, uint32_t task_id, ui uint32_t GraphModelListener::GetResultCode() const { if (!is_finished_) { REPORT_CALL_ERROR("E19999", "Model not run finish"); - GELOGE(INTERNAL_ERROR, "[GraphManager] model not run finish."); + GELOGE(INTERNAL_ERROR, "[Check][Param] model not run finish."); return INTERNAL_ERROR; } return result_code_; @@ -170,7 +171,9 @@ bool HasCalcOp(const ComputeGraphPtr &graph) { for (const auto &node : graph->GetAllNodes()) { OpDescPtr op_desc = node->GetOpDesc(); - GE_IF_BOOL_EXEC(op_desc == nullptr, GELOGE(FAILED, "Node GetOpDesc is nullptr"); return false); + GE_IF_BOOL_EXEC(op_desc == nullptr, + REPORT_INNER_ERROR("E19999", "GetOpDesc failed, Node GetOpDesc is nullptr"); + GELOGE(FAILED, "[Get][OpDesc] failed, Node GetOpDesc is nullptr"); return false); if (calc_op_type.find(op_desc->GetType()) != calc_op_type.end()) { return true; } diff --git a/ge/graph/manager/graph_mem_allocator.cc b/ge/graph/manager/graph_mem_allocator.cc index 0cccaf99..dd38274e 100755 --- a/ge/graph/manager/graph_mem_allocator.cc +++ b/ge/graph/manager/graph_mem_allocator.cc @@ -50,9 +50,7 @@ uint8_t *MemoryAllocator::MallocMemory(const string &purpose, size_t memory_size if (rtMalloc(reinterpret_cast(&memory_addr), memory_size, memory_type_) != RT_ERROR_NONE) { REPORT_CALL_ERROR("E19999", "Call rtMalloc fail, purpose:%s, size:%zu, device_id:%u", purpose.c_str(), memory_size, device_id); - GELOGE(ge::INTERNAL_ERROR, - "MemoryAllocator::MallocMemory device_id = %u," - " size= %lu", + GELOGE(ge::INTERNAL_ERROR, "[Malloc][Memory] failed, device_id = %u, size= %lu", device_id, memory_size); return nullptr; @@ -68,7 +66,7 @@ Status MemoryAllocator::FreeMemory(uint8_t *memory_addr, uint32_t device_id) con auto rtRet = rtFree(memory_addr); if (rtRet != RT_ERROR_NONE) { REPORT_CALL_ERROR("E19999", "Call rtFree fail, device_id:%u", device_id); - GELOGE(rtRet, "MemoryAllocator::MallocMemory device_id = %u", device_id); + GELOGE(rtRet, "[Call][RtFree] failed, device_id = %u", device_id); return RT_ERROR_TO_GE_STATUS(rtRet); } memory_addr = nullptr; @@ -88,10 +86,8 @@ uint8_t *MemoryAllocator::MallocMemory(const string &purpose, const string &memo if (memory_addr == nullptr) { REPORT_CALL_ERROR("E19999", "Malloc Memory fail, purpose:%s, memory_key:%s, memory_size:%zu, device_id:%u", purpose.c_str(), memory_key.c_str(), memory_size, device_id); - GELOGE(ge::INTERNAL_ERROR, - "MemoryAllocator::MallocMemory failed," - " memory_key[%s], size = %lu.", - memory_key.c_str(), memory_size); + GELOGE(ge::INTERNAL_ERROR, "[Malloc][Memory] failed, memory_key[%s], size = %lu, device_id:%u.", + memory_key.c_str(), memory_size, device_id); return nullptr; } @@ -126,10 +122,8 @@ Status MemoryAllocator::FreeMemory(const string &memory_key, uint32_t device_id) if (FreeMemory(it->second.memory_addr_, device_id) != ge::SUCCESS) { REPORT_CALL_ERROR("E19999", "Free Memory fail, memory_key:%s, device_id:%u", memory_key.c_str(), device_id); - GELOGE(ge::INTERNAL_ERROR, - "MemoryAllocator::FreeMemory rtFree failed," - " memory_key[%s]", - memory_key.c_str()); + GELOGE(ge::INTERNAL_ERROR, "[Free][Memory] failed, memory_key[%s], device_id:%u", + memory_key.c_str(), device_id); return ge::INTERNAL_ERROR; } diff --git a/ge/graph/manager/graph_var_manager.cc b/ge/graph/manager/graph_var_manager.cc index 5f7586da..64918aab 100755 --- a/ge/graph/manager/graph_var_manager.cc +++ b/ge/graph/manager/graph_var_manager.cc @@ -40,7 +40,8 @@ ge::Status VarResource::GetVarAddr(const std::string &var_name, const ge::GeTens if (dev_ptr == nullptr) { REPORT_INNER_ERROR("E19999", "Param dev_ptr is nullptr, var_name:%s, session_id:%lu, " "check invalid", var_name.c_str(), session_id_); - GELOGE(FAILED, "[GetVarAddr] dev_ptr is null!"); + GELOGE(FAILED, "[Check][Param] Param dev_ptr is nullptr, var_name:%s, session_id:%lu", + var_name.c_str(), session_id_); return FAILED; } std::string var_key = VarKey(var_name, tensor_desc); @@ -51,7 +52,8 @@ ge::Status VarResource::GetVarAddr(const std::string &var_name, const ge::GeTens REPORT_INNER_ERROR("E19999", "var_key:%s can't find in var_addr_mgr_map_, var_name:%s, session_id:%lu, " "check invalid", var_key.c_str(), var_name.c_str(), session_id_); - GELOGE(FAILED, "VarResource::GetVarAddr failed, var_key %s", var_key.c_str()); + GELOGE(FAILED, "[Check][Param] var_key:%s can't find in var_addr_mgr_map_, var_name:%s, session_id:%lu", + var_key.c_str(), var_name.c_str(), session_id_); return FAILED; } @@ -109,7 +111,8 @@ ge::Status VarResource::SaveVarAddr(const std::string &var_name, const ge::GeTen REPORT_INNER_ERROR("E19999", "var_key:%s conflict in var_addr_mgr_map_, var_name:%s, session_id:%lu, " "check invalid", var_key.c_str(), var_name.c_str(), session_id_); - GELOGE(FAILED, "VarResource::SaveVarAddr, var_key %s save addr conflict", var_key.c_str()); + GELOGE(FAILED, "[Check][Param] var_key:%s conflict in var_addr_mgr_map_, var_name:%s, session_id:%lu", + var_key.c_str(), var_name.c_str(), session_id_); return FAILED; } @@ -145,14 +148,15 @@ ge::Status VarResource::RenewCurVarDesc(const std::string &var_name, const ge::O if (op_desc == nullptr) { REPORT_INNER_ERROR("E19999", "Param op_desc is nullptr, var_name:%s, session_id:%lu, check invalid", var_name.c_str(), session_id_); - GELOGE(FAILED, "[RenewCurVarDesc] renew var desc fail! input opdesc is null!"); + GELOGE(FAILED, "[Check][Param] input opdesc is nullptr, var_name:%s, session_id:%lu", + var_name.c_str(), session_id_); return FAILED; } ge::GeTensorDesc curr_desc; ge::Status ret = GetCurVarDesc(var_name, curr_desc); if (ret != SUCCESS) { - GELOGE(FAILED, "[RenewCurVarDesc] Get var desc fail!"); + GELOGE(FAILED, "[Get][CurVarDesc] fail, var_name:%s, session_id:%lu", var_name.c_str(), session_id_); return FAILED; } std::string key = VarKey(var_name, curr_desc); @@ -164,7 +168,8 @@ ge::Status VarResource::RenewCurVarDesc(const std::string &var_name, const ge::O REPORT_INNER_ERROR("E19999", "var_key:%s can't find in var_addr_mgr_map_, var_name:%s, session_id:%lu, op:%s(%s), " "check invalid", key.c_str(), var_name.c_str(), session_id_, op_desc->GetName().c_str(), op_desc->GetType().c_str()); - GELOGE(FAILED, "[RenewCurVarDesc] can't find ele with key [%s]", key.c_str()); + GELOGE(FAILED, "[Check][Param] var_key:%s can't find in var_addr_mgr_map_, var_name:%s, session_id:%lu, op:%s(%s)", + key.c_str(), var_name.c_str(), session_id_, op_desc->GetName().c_str(), op_desc->GetType().c_str()); return FAILED; } auto val = iter->second; @@ -285,14 +290,15 @@ Status HbmMemResource::AssignVarMem(const std::string &var_name, uint64_t size, if (total_size_ < var_mem_size_) { REPORT_INNER_ERROR("E19999", "VarMemMaxSize:%lu < var_mem_size_:%lu, var_size:%lu, var_name:%s, check invalid" "", total_size_, var_mem_size_, size, var_name.c_str()); - GELOGE(PARAM_INVALID, "total_size_: %lu is smaller than var_mem_size_: %lu", total_size_, var_mem_size_); + GELOGE(PARAM_INVALID, "[Check][Param] total_size_:%lu is smaller than var_mem_size_:%lu, var_name:%s", + total_size_, var_mem_size_, var_name.c_str()); return PARAM_INVALID; } uint64_t free_size = total_size_ - var_mem_size_; if (free_size < (size + kSessionMemAlignSize * kSessionMemAlignUnit)) { REPORT_INNER_ERROR("E19999", "free_size:%lu not enough, var_align_size:%lu, var_name:%s, check invalid", free_size, size, var_name.c_str()); - GELOGE(PARAM_INVALID, "Out of memory : current var size[%lu] exceeds total var size[%lu]", + GELOGE(PARAM_INVALID, "[Check][Param] Out of memory: current var size[%lu] exceeds total var size[%lu]", size + kSessionMemAlignSize * kSessionMemAlignUnit + var_mem_size_, total_size_); return PARAM_INVALID; } @@ -317,7 +323,7 @@ Status RdmaMemResource::AssignVarMem(const std::string &var_name, uint64_t size, if (buffer == nullptr) { REPORT_CALL_ERROR("E19999", "malloc rdma memory fail, var_size:%lu, var_name:%s", size, var_name.c_str()); - GELOGE(MEMALLOC_FAILED, "Failed to malloc rdma memory for node %s, size = %lu", var_name.c_str(), size); + GELOGE(MEMALLOC_FAILED, "[Malloc][RdmaMemory] for node %s failed, size = %lu", var_name.c_str(), size); return MEMALLOC_FAILED; } address = static_cast(reinterpret_cast(buffer)); @@ -468,7 +474,8 @@ int64_t VarManager::GetVarMemSize(rtMemType_t memory_type) { if (mem_resource == nullptr) { REPORT_INNER_ERROR("E19999", "Find no mem_resource in map, memory_type:%d, session_id:%lu", memory_type, session_id_); - GELOGE(ge::INTERNAL_ERROR, "MemResource is invalid."); + GELOGE(ge::INTERNAL_ERROR, "[Check][Param] MemResource is invalid, memory_type:%d, session_id:%lu", + memory_type, session_id_); return 0; } return mem_resource->GetVarMemSize(); @@ -483,7 +490,8 @@ Status VarManager::UpdateVarMemSize(rtMemType_t memory_type, int64_t mem_size) { if (mem_resource == nullptr) { REPORT_CALL_ERROR("E19999", "memory_type:%d invalid or New MemResource fail, session_id:%lu", memory_type, session_id_); - GELOGE(ge::INTERNAL_ERROR, "Alloc MemResource failed, memory_type = %u.", memory_type); + GELOGE(ge::INTERNAL_ERROR, "[Alloc][MemResource] failed, memory_type:%u, session_id:%lu", + memory_type, session_id_); return ge::INTERNAL_ERROR; } else { mem_resource_map_[memory_type] = mem_resource; @@ -495,7 +503,8 @@ Status VarManager::UpdateVarMemSize(rtMemType_t memory_type, int64_t mem_size) { if (mem_resource == nullptr) { REPORT_INNER_ERROR("E19999", "MemResource is invalid, memory_type:%d, session_id:%lu", memory_type, session_id_); - GELOGE(ge::INTERNAL_ERROR, "MemResource is invalid."); + GELOGE(ge::INTERNAL_ERROR, "[Check][Param] MemResource is invalid, memory_type:%u, session_id:%lu", + memory_type, session_id_); return FAILED; } mem_resource->UpdateVarMemSize(mem_size); @@ -515,7 +524,8 @@ ge::Status VarManager::AssignVarMem(const std::string &var_name, const ge::GeTen if (result != ge::SUCCESS) { REPORT_CALL_ERROR("E19999", "Get size from tensor fail, var_name:%s, memory_type:%d, session_id:%lu", var_name.c_str(), memory_type, session_id_); - GELOGE(result, "get size from TensorDesc failed"); + GELOGE(result, "[Get][Size] from tensor fail, var_name:%s, memory_type:%u, session_id:%lu", + var_name.c_str(), memory_type, session_id_); return result; } @@ -526,7 +536,8 @@ ge::Status VarManager::AssignVarMem(const std::string &var_name, const ge::GeTen if (mem_resource == nullptr) { REPORT_CALL_ERROR("E19999", "memory_type:%d invalid or New MemResource fail, session_id:%lu", memory_type, session_id_); - GELOGE(ge::INTERNAL_ERROR, "Alloc MemResource failed, memory_type = %u.", memory_type); + GELOGE(ge::INTERNAL_ERROR, "[Alloc][MemResource] failed, memory_type:%u, session_id:%lu.", + memory_type, session_id_); return ge::INTERNAL_ERROR; } else { mem_resource_map_[memory_type] = mem_resource; @@ -538,7 +549,8 @@ ge::Status VarManager::AssignVarMem(const std::string &var_name, const ge::GeTen if (mem_resource == nullptr) { REPORT_INNER_ERROR("E19999", "MemResource is invalid, memory_type:%d, session_id:%lu", memory_type, session_id_); - GELOGE(ge::INTERNAL_ERROR, "MemResource is invalid, memory_type = %u.", memory_type); + GELOGE(ge::INTERNAL_ERROR, "[Check][Param] MemResource is invalid, memory_type:%u, session_id:%lu.", + memory_type, session_id_); return ge::INTERNAL_ERROR; } @@ -567,14 +579,15 @@ ge::Status VarManager::AssignVarMem(const std::string &var_name, const ge::GeTen if (can_not_reuse_old_memory) { result = mem_resource->AssignVarMem(var_name, tensor_desc_size, session_id_, mem_offset); if (result != SUCCESS) { - GELOGE(ge::INTERNAL_ERROR, "AssignVarMem by offset failed."); + GELOGE(ge::INTERNAL_ERROR, "[Assign][VarMem] by offset failed, session_id:%lu.", session_id_); return ge::INTERNAL_ERROR; } result = var_resource_->SaveVarAddr( var_name, tensor_desc, reinterpret_cast(static_cast(mem_offset)), memory_type); if (result != SUCCESS) { - GELOGE(ge::INTERNAL_ERROR, "AssignVarMem by offset failed."); + GELOGE(ge::INTERNAL_ERROR, "[Save][VarAddr] by offset failed, memory type:%u, session_id:%lu.", + memory_type, session_id_); return ge::INTERNAL_ERROR; } } @@ -681,7 +694,8 @@ ge::Status VarManager::RenewCurVarDesc(const std::string &var_name, ge::OpDescPt REPORT_INNER_ERROR("E19999", "VarManager has not been init, op:%s(%s), session_id:%lu, check invalid", op_desc->GetName().c_str(), op_desc->GetType().c_str(), session_id_); - GELOGE(ge::INTERNAL_ERROR, "VarManager has not been init."); + GELOGE(ge::INTERNAL_ERROR, "[Check][Param] VarManager has not been init, op:%s(%s), session_id:%lu", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), session_id_); return ge::INTERNAL_ERROR; } return var_resource_->RenewCurVarDesc(var_name, std::move(op_desc)); @@ -729,10 +743,8 @@ ge::Status VarManager::MallocVarMemory(size_t memory_size) { const string purpose("variables and constant op memory in training network."); var_mem_base = MemManager::Instance().MemInstance(RT_MEMORY_HBM).MallocMemory(purpose, memory_key, var_memory_size); if (var_mem_base == nullptr) { - GELOGE(ge::INTERNAL_ERROR, - "VarManager::MallocVarMemory failed " - "session_id = %s", - memory_key.c_str()); + GELOGE(ge::INTERNAL_ERROR, "[Malloc][VarMemory] failed, size:%zu, session_id:%s", + var_memory_size, memory_key.c_str()); return ge::INTERNAL_ERROR; } return SUCCESS; @@ -812,7 +824,7 @@ Status VarManager::SetMemoryMallocSize(const map &options) { string graph_memory_manager_malloc_max_size = it->second; ge::Status ret = ParseMemoryMallocSize(graph_memory_manager_malloc_max_size, graph_mem_max_size_); if (ret != SUCCESS) { - GELOGE(ge::GE_GRAPH_OPTIONS_INVALID, "Parse graph memory manager malloc max size failed."); + GELOGE(ge::GE_GRAPH_OPTIONS_INVALID, "[Call][ParseMemoryMallocSize] failed, session id:%lu.", session_id_); return ge::GE_GRAPH_OPTIONS_INVALID; } GELOGI("The max size for graph mem is set to %zu", graph_mem_max_size_); @@ -825,7 +837,7 @@ Status VarManager::SetMemoryMallocSize(const map &options) { string memory_var_manager_malloc_size = it->second; ge::Status ret = ParseMemoryMallocSize(memory_var_manager_malloc_size, var_mem_max_size_); if (ret != SUCCESS) { - GELOGE(ge::GE_GRAPH_OPTIONS_INVALID, "Parse memory var manager malloc size failed."); + GELOGE(ge::GE_GRAPH_OPTIONS_INVALID, "[Call][ParseMemoryMallocSize] failed, session id:%lu.", session_id_); return ge::GE_GRAPH_OPTIONS_INVALID; } } @@ -834,8 +846,8 @@ Status VarManager::SetMemoryMallocSize(const map &options) { if (var_mem_logic_base_ > kMaxMemorySize) { REPORT_INNER_ERROR("E19999", "var_login_base:%zu can not exeed limit:%zu, session_id:%lu, check invalid", var_mem_logic_base_, kMaxMemorySize, session_id_); - GELOGE(ge::GE_GRAPH_OPTIONS_INVALID, "kMemoryVarLogicBase : %zu can not exceed max memory size : %zu.", - var_mem_logic_base_, kMaxMemorySize); + GELOGE(ge::GE_GRAPH_OPTIONS_INVALID, "[Check][Param] kMemoryVarLogicBase:%zu can not exceed " + "max memory size:%zu, session_id:%lu.", var_mem_logic_base_, kMaxMemorySize, session_id_); return ge::GE_GRAPH_OPTIONS_INVALID; } @@ -843,8 +855,8 @@ Status VarManager::SetMemoryMallocSize(const map &options) { if (use_max_mem_size_ > kMaxMemorySize) { REPORT_INNER_ERROR("E19999", "all mem_use size:%zu can not exeed limit:%zu, session_id:%lu, check invalid", use_max_mem_size_, kMaxMemorySize, session_id_); - GELOGE(ge::GE_GRAPH_OPTIONS_INVALID, "kUseMaxMemorySize : %zu can not exceed max memory size : %zu.", - use_max_mem_size_, kMaxMemorySize); + GELOGE(ge::GE_GRAPH_OPTIONS_INVALID, "[Check][Param] kUseMaxMemorySize:%zu can not exceed " + "max memory size:%zu, session_id:%lu.", use_max_mem_size_, kMaxMemorySize, session_id_); return ge::GE_GRAPH_OPTIONS_INVALID; } GELOGI("Set memory malloc size successfully"); @@ -855,7 +867,7 @@ Status VarManager::ParseMemoryMallocSize(string &memory_size, size_t &result) { if (memory_size.empty()) { REPORT_INNER_ERROR("E19999", "Param memory_size is empty, session_id:%lu, check invalid", session_id_); - GELOGE(GE_GRAPH_OPTIONS_INVALID, "Memory malloc size input is empty."); + GELOGE(GE_GRAPH_OPTIONS_INVALID, "[Check][Param] Memory malloc size input is empty, session_id:%lu.", session_id_); return GE_GRAPH_OPTIONS_INVALID; } // split string by '*' @@ -882,7 +894,9 @@ Status VarManager::ParseMemoryMallocSize(string &memory_size, size_t &result) { if (!isdigit(c)) { REPORT_INNER_ERROR("E19999", "Param memory_size:%s contains non digit, session_id:%lu, check invalid", memory_size.c_str(), session_id_); - GELOGE(GE_GRAPH_OPTIONS_INVALID, "Memory malloc size input contains non digit."); + GELOGE(GE_GRAPH_OPTIONS_INVALID, + "[Check][Param] Memory malloc size:%s input contains non digit, session_id:%lu.", + memory_size.c_str(), session_id_); return GE_GRAPH_OPTIONS_INVALID; } } @@ -891,13 +905,15 @@ Status VarManager::ParseMemoryMallocSize(string &memory_size, size_t &result) { REPORT_INNER_ERROR("E19999", "Param memory_size:%s will overflow after multi all, session_id:%lu, " "check invalid", memory_size.c_str(), session_id_); - GELOGE(FAILED, "Input memory size is out of range."); + GELOGE(FAILED, "[Check][Param] Param memory_size:%s will overflow after multi all, session_id:%lu", + memory_size.c_str(), session_id_); return FAILED); if ((num > kMaxMemorySize) || (result * static_cast(num) > kMaxMemorySize)) { REPORT_INNER_ERROR("E19999", "Param memory_size:%s after multi will exceed limit:%lu, session_id:%lu, " "check invalid", memory_size.c_str(), kMaxMemorySize, session_id_); - GELOGE(FAILED, "Input memory size can not exceed max memory size : %zu.", kMaxMemorySize); + GELOGE(FAILED, "[Check][Param] Input memory size can not exceed max memory size:%zu, session_id:%lu.", + kMaxMemorySize, session_id_); return FAILED; } result *= static_cast(num); @@ -1001,10 +1017,7 @@ VarManager *VarManagerPool::GetVarManager(uint64_t session_id) { VarManager *var_manager = new (std::nothrow) VarManager(session_id); if (var_manager == nullptr) { REPORT_INNER_ERROR("E19999", "New VarManager fail, session_id:%lu", session_id); - GELOGE(INTERNAL_ERROR, - "VarManager::Instance find session by " - "session_id[%lu] failed.", - session_id); + GELOGE(INTERNAL_ERROR, "[New][VarManager] fail, session_id:%lu", session_id); static VarManager new_var_manager(0); return &new_var_manager; } diff --git a/ge/graph/manager/host_mem_allocator.cc b/ge/graph/manager/host_mem_allocator.cc index 98f9a313..588d08a2 100644 --- a/ge/graph/manager/host_mem_allocator.cc +++ b/ge/graph/manager/host_mem_allocator.cc @@ -34,8 +34,8 @@ uint8_t *HostMemAllocator::Malloc(size_t size) { std::lock_guard lock(mutex_); std::shared_ptr aligned_ptr = MakeShared(size); if (aligned_ptr == nullptr) { - REPORT_INNER_ERROR("E19999", "New AlignedPtr fail"); - GELOGE(INTERNAL_ERROR, "make shared_ptr for AlignedPtr failed"); + REPORT_INNER_ERROR("E19999", "New AlignedPtr fail, size:%zu", size); + GELOGE(INTERNAL_ERROR, "[Call][MakeShared] for AlignedPtr failed, size:%zu", size); return nullptr; } allocated_blocks_[aligned_ptr->Get()] = { size, aligned_ptr }; @@ -46,7 +46,7 @@ uint8_t *HostMemAllocator::Malloc(size_t size) { Status HostMemAllocator::Free(const void *memory_addr) { if (memory_addr == nullptr) { REPORT_INNER_ERROR("E19999", "Param memory_addr is nullptr, check invalid"); - GELOGE(GE_GRAPH_FREE_FAILED, "Invalid memory pointer"); + GELOGE(GE_GRAPH_FREE_FAILED, "[Check][Param] Invalid memory pointer"); return GE_GRAPH_FREE_FAILED; } @@ -54,7 +54,7 @@ Status HostMemAllocator::Free(const void *memory_addr) { auto it = allocated_blocks_.find(memory_addr); if (it == allocated_blocks_.end()) { REPORT_INNER_ERROR("E19999", "Memory_addr is not alloc before, check invalid"); - GELOGE(PARAM_INVALID, "Invalid memory pointer"); + GELOGE(PARAM_INVALID, "[Check][Param] Invalid memory pointer:%p", memory_addr); return PARAM_INVALID; } it->second.second.reset(); diff --git a/ge/graph/manager/host_mem_manager.cc b/ge/graph/manager/host_mem_manager.cc index 2908df39..63265bf4 100644 --- a/ge/graph/manager/host_mem_manager.cc +++ b/ge/graph/manager/host_mem_manager.cc @@ -39,9 +39,8 @@ Status SharedMemAllocator::Allocate(SharedMemInfo &mem_info) { rtMallocHostSharedMemoryOut output_para; rtError_t rt_ret = rtMallocHostSharedMemory(&input_para, &output_para); if (rt_ret != RT_ERROR_NONE) { - REPORT_CALL_ERROR("E19999", "Call rtMallocHostSharedMemory fail, ret:0x%X", - rt_ret); - GELOGE(RT_FAILED, "Call rt api(rtMallocHostSharedMemory) failed, devid:[%u].", device_id); + REPORT_CALL_ERROR("E19999", "Call rtMallocHostSharedMemory fail, ret:0x%X", rt_ret); + GELOGE(RT_FAILED, "[Call][RtMallocHostSharedMemory] failed, devid:[%u].", device_id); return GE_GRAPH_MEMORY_ALLOC_FAILED; } mem_info.fd = output_para.fd; @@ -60,9 +59,8 @@ Status SharedMemAllocator::DeAllocate(SharedMemInfo &mem_info) { mem_info.host_aligned_ptr->MutableGet(), mem_info.device_address}; rtError_t rt_ret = rtFreeHostSharedMemory(&free_para); if (rt_ret != RT_ERROR_NONE) { - REPORT_CALL_ERROR("E19999", "Call rtFreeHostSharedMemory fail, ret:0x%X", - rt_ret); - GELOGE(RT_FAILED, "Call rt api(rtFreeHostSharedMemory) failed, ret: 0x%X.", rt_ret); + REPORT_CALL_ERROR("E19999", "Call rtFreeHostSharedMemory fail, ret:0x%X", rt_ret); + GELOGE(RT_FAILED, "[Call][RtFreeHostSharedMemory] failed, ret:0x%X.", rt_ret); return RT_FAILED; } return ge::SUCCESS; @@ -78,7 +76,7 @@ Status HostMemManager::Initialize() { allocator_ = std::unique_ptr(new (std::nothrow) SharedMemAllocator()); if (allocator_ == nullptr) { REPORT_CALL_ERROR("E19999", "New SharedMemAllocator fail"); - GELOGE(GE_GRAPH_MALLOC_FAILED, "Shared memory allocator init failed!"); + GELOGE(GE_GRAPH_MALLOC_FAILED, "[New][SharedMemAllocator] failed!"); return GE_GRAPH_MALLOC_FAILED; } return SUCCESS; @@ -98,9 +96,8 @@ Status HostMemManager::MallocSharedMemory(SharedMemInfo &mem_info) { std::lock_guard lock(mutex_); auto iter = var_memory_base_map_.find(mem_info.op_name); if (iter != var_memory_base_map_.end()) { - REPORT_INNER_ERROR("E19999", "MemInfo.op_name:%s can't find in var_memory_base_map_", - mem_info.op_name.c_str()); - GELOGE(FAILED, "Host shared memory for op %s has been malloced", mem_info.op_name.c_str()); + REPORT_INNER_ERROR("E19999", "Host shared memory for op %s has been malloced", mem_info.op_name.c_str()); + GELOGE(FAILED, "[Check][Param] Host shared memory for op %s has been malloced", mem_info.op_name.c_str()); return FAILED; } mem_info.shm_name = OpNameToShmName(mem_info.op_name); @@ -113,9 +110,8 @@ Status HostMemManager::MallocSharedMemory(SharedMemInfo &mem_info) { Status HostMemManager::QueryVarMemInfo(const string &op_name, uint64_t &base_addr, uint64_t &data_size) { std::lock_guard lock(mutex_); if (var_memory_base_map_.find(op_name) == var_memory_base_map_.end()) { - REPORT_INNER_ERROR("E19999", "MemInfo.op_name:%s can't find in var_memory_base_map_", - op_name.c_str()); - GELOGE(INTERNAL_ERROR, "Find host base base_addr failed,node name:%s!", op_name.c_str()); + REPORT_INNER_ERROR("E19999", "MemInfo.op_name:%s can't find in var_memory_base_map_", op_name.c_str()); + GELOGE(INTERNAL_ERROR, "[Check][Param] Find host base base_addr failed, node name:%s!", op_name.c_str()); return INTERNAL_ERROR; } base_addr = static_cast(reinterpret_cast(var_memory_base_map_[op_name].device_address)); diff --git a/ge/graph/manager/memory_api.cc b/ge/graph/manager/memory_api.cc index 8e737021..8ea0594b 100644 --- a/ge/graph/manager/memory_api.cc +++ b/ge/graph/manager/memory_api.cc @@ -50,9 +50,8 @@ Status RdmaRemoteRegister(const std::vector &var_info, rtMemType_t path.append(file_name); string canonical_path = RealPath(path.c_str()); if (canonical_path.empty()) { - REPORT_INNER_ERROR("E19999", "canonical_path:%s is empty, check invalid", - canonical_path.c_str()); - GELOGE(FAILED, "Failed to get realpath of %s", path.c_str()); + REPORT_INNER_ERROR("E19999", "canonical_path:%s is empty, check invalid", canonical_path.c_str()); + GELOGE(FAILED, "[Call][RealPath] Failed to get realpath of %s", path.c_str()); return FAILED; } GELOGI("FileName:%s, Path:%s.", file_name.c_str(), canonical_path.c_str()); @@ -69,15 +68,14 @@ Status RdmaRemoteRegister(const std::vector &var_info, rtMemType_t if (hcom_remote_mem_register == nullptr) { REPORT_CALL_ERROR("E19999", "Symbol HcomRegRemoteAccessMem can't find in %s, check invalid", canonical_path.c_str()); - GELOGE(FAILED, "Failed to invoke hcom_remote_mem_register function."); + GELOGE(FAILED, "[Check][Param] Symbol HcomRegRemoteAccessMem can't find in %s", canonical_path.c_str()); return FAILED; } HcclResult hccl_ret = hcom_remote_mem_register(reg_addrs.get(), table_len); if (hccl_ret != HCCL_SUCCESS) { - REPORT_CALL_ERROR("E19999", "Call hcom_remote_mem_register failed, ret:%d,", - hccl_ret); - GELOGE(HCCL_E_INTERNAL, "Rdma mem register failed, ret: 0x%X", hccl_ret); + REPORT_CALL_ERROR("E19999", "Call hcom_remote_mem_register failed, ret:%d,", hccl_ret); + GELOGE(HCCL_E_INTERNAL, "[Call][HcomRemoteMemRegister] Rdma mem register failed, ret:0x%X", hccl_ret); return HCCL_E_INTERNAL; } return SUCCESS; @@ -88,14 +86,14 @@ Status MallocSharedMemory(const TensorInfo &tensor_info, uint64_t &dev_addr, uin uint32_t type_size = 0; bool result = TypeUtils::GetDataTypeLength(tensor_info.data_type, type_size); if (!result) { - GELOGE(GRAPH_FAILED, "GetDataTypeLength failed, data_type=(%s).", + GELOGE(GRAPH_FAILED, "[Get][DataTypeLength] failed, data_type=(%s).", TypeUtils::DataTypeToSerialString(tensor_info.data_type).c_str()); return GRAPH_FAILED; } memory_size = type_size; for (auto dim : tensor_info.dims) { if (dim <= 0) { - GELOGE(GRAPH_FAILED, "Tensor dims should be positive"); + GELOGE(GRAPH_FAILED, "[Check][Param] Tensor dims should be positive"); return GRAPH_FAILED; } memory_size *= dim; @@ -103,7 +101,7 @@ Status MallocSharedMemory(const TensorInfo &tensor_info, uint64_t &dev_addr, uin SharedMemInfo mem_info(tensor_info.var_name, memory_size); Status ret = HostMemManager::Instance().MallocSharedMemory(mem_info); if (ret != SUCCESS) { - GELOGE(GRAPH_FAILED, "MallocSharedMemory failed op name [%s]", tensor_info.var_name.c_str()); + GELOGE(GRAPH_FAILED, "[Malloc][SharedMemory] failed, op name [%s]", tensor_info.var_name.c_str()); return GRAPH_FAILED; } dev_addr = reinterpret_cast(reinterpret_cast(mem_info.device_address)); diff --git a/ge/graph/manager/model_manager/event_manager.cc b/ge/graph/manager/model_manager/event_manager.cc index 69a946f9..339e9894 100644 --- a/ge/graph/manager/model_manager/event_manager.cc +++ b/ge/graph/manager/model_manager/event_manager.cc @@ -45,7 +45,7 @@ Status EventManager::Init(size_t event_num) { void EventManager::Release() noexcept { for (size_t i = 0; i < this->event_list_.size(); ++i) { rtError_t rt_ret = rtEventDestroy(this->event_list_[i]); - RETURN_IF_COND_NOT_MET(rt_ret == RT_ERROR_NONE, "Destroy event failed, idx is %zu, ret is 0x%x.", i, rt_ret); + RETURN_IF_COND_NOT_MET(rt_ret == RT_ERROR_NONE, "[Destroy][Event] failed, idx is %zu, ret is 0x%x.", i, rt_ret); } this->event_list_.clear(); diff --git a/ge/graph/manager/rdma_pool_allocator.cc b/ge/graph/manager/rdma_pool_allocator.cc index 58829adb..4297be95 100644 --- a/ge/graph/manager/rdma_pool_allocator.cc +++ b/ge/graph/manager/rdma_pool_allocator.cc @@ -82,8 +82,8 @@ Status RdmaPoolAllocator::InitMemory(size_t mem_size) { auto device_id = GetContext().DeviceId(); GELOGD("Init Rdma Memory with size [%zu] for devid:[%u]", mem_size, device_id); if (rdma_base_addr_ != nullptr) { - REPORT_INNER_ERROR("E19999", "Param rdma_base_addr_ is nullptr, check invalid"); - GELOGE(GE_MULTI_INIT, "Rdma pool has been malloced"); + REPORT_INNER_ERROR("E19999", "Param rdma_base_addr_ is not nullptr, devid:%u, check invalid", device_id); + GELOGE(GE_MULTI_INIT, "[Check][Param] Rdma pool has been malloced, devid:%u", device_id); return GE_MULTI_INIT; } const std::string purpose = "Memory for rdma pool."; @@ -95,15 +95,15 @@ Status RdmaPoolAllocator::InitMemory(size_t mem_size) { rdma_base_addr_ = memory_allocator_->MallocMemory(purpose, mem_size, device_id); if (rdma_base_addr_ == nullptr) { - GELOGE(GE_GRAPH_MALLOC_FAILED, "Rdma pool memory malloc failed"); + GELOGE(GE_GRAPH_MALLOC_FAILED, "[Malloc][Memory] failed, size:%zu, device_id:%u", mem_size, device_id); return GE_GRAPH_MALLOC_FAILED; } rdma_mem_size_ = mem_size; // Init with a base block. auto *base_block = new (std::nothrow) Block(device_id, mem_size, rdma_base_addr_); if (base_block == nullptr) { - REPORT_CALL_ERROR("E19999", "New Block failed, device_id:%u", device_id); - GELOGE(GE_GRAPH_MALLOC_FAILED, "Block malloc failed"); + REPORT_CALL_ERROR("E19999", "New Block failed, size:%zu, device_id:%u", mem_size, device_id); + GELOGE(GE_GRAPH_MALLOC_FAILED, "[New][Block] failed, size:%zu, device_id:%u", mem_size, device_id); return GE_GRAPH_MALLOC_FAILED; } block_bin_.insert(base_block); @@ -123,7 +123,7 @@ uint8_t *RdmaPoolAllocator::Malloc(size_t size, uint32_t device_id) { if (block->ptr == nullptr) { REPORT_INNER_ERROR("E19999", "Rdmapool memory address is nullptr, device_id:%u, check invalid", device_id); - GELOGE(INTERNAL_ERROR, "Rdmapool memory address is nullptr."); + GELOGE(INTERNAL_ERROR, "[Check][Param] Rdmapool memory address is nullptr, device_id:%u", device_id); return nullptr; } allocated_blocks_.emplace(block->ptr, block); @@ -155,9 +155,8 @@ uint8_t *RdmaPoolAllocator::Malloc(size_t size, uint32_t device_id) { Status RdmaPoolAllocator::Free(uint8_t *memory_addr, uint32_t device_id) { GELOGI("Free rdma memory, device id = %u", device_id); if (memory_addr == nullptr) { - REPORT_INNER_ERROR("E19999", "Param memory_addr is nullptr, device_id:%u, check invalid", - device_id); - GELOGE(GE_GRAPH_FREE_FAILED, "Invalid memory pointer"); + REPORT_INNER_ERROR("E19999", "Param memory_addr is nullptr, device_id:%u, check invalid", device_id); + GELOGE(GE_GRAPH_FREE_FAILED, "[Check][Param] Invalid memory pointer, device id:%u", device_id); return GE_GRAPH_FREE_FAILED; } @@ -166,7 +165,7 @@ Status RdmaPoolAllocator::Free(uint8_t *memory_addr, uint32_t device_id) { if (it == allocated_blocks_.end()) { REPORT_INNER_ERROR("E19999", "Param memory_addr is not allocated before, device_id:%u, " "check invalid", device_id); - GELOGE(PARAM_INVALID, "Invalid memory pointer"); + GELOGE(PARAM_INVALID, "[Check][Param] Invalid memory pointer, device id:%u", device_id); return PARAM_INVALID; } @@ -209,7 +208,7 @@ void RdmaPoolAllocator::MergeBlocks(Block *dst, Block *src) { Status RdmaPoolAllocator::GetBaseAddr(uint64_t &base_addr, uint64_t &mem_size) { if (rdma_base_addr_ == nullptr) { REPORT_INNER_ERROR("E19999", "Param rdma_base_addr_ is nullptr, check invalid"); - GELOGE(INTERNAL_ERROR, "Rdma base addr is nullptr."); + GELOGE(INTERNAL_ERROR, "[Check][Param] Rdma base addr is nullptr."); return INTERNAL_ERROR; } base_addr = static_cast(reinterpret_cast(rdma_base_addr_)); diff --git a/ge/graph/manager/trans_var_data_utils.cc b/ge/graph/manager/trans_var_data_utils.cc index 9c1290fa..621eba79 100644 --- a/ge/graph/manager/trans_var_data_utils.cc +++ b/ge/graph/manager/trans_var_data_utils.cc @@ -37,7 +37,8 @@ class RtContextSwitchGuard { if (ret != RT_ERROR_NONE) { REPORT_CALL_ERROR("E19999", "Call rtCtxGetCurrent failed, device_id:%u, ret:0x%X,", device_id, ret); - GELOGE(RT_FAILED, "Failed to get current context from rt, error-code %d", ret); + GELOGE(RT_FAILED, "[Call][RtCtxGetCurrent] Failed to get current context, device_id:%u, ret:0x%X", + device_id, ret); return; } @@ -45,15 +46,14 @@ class RtContextSwitchGuard { if (ret != RT_ERROR_NONE) { REPORT_CALL_ERROR("E19999", "Call rtCtxCreate failed, device_id:%u, ret:0x%X,", device_id, ret); - GELOGE(RT_FAILED, "Failed to create new context for device %u, error-code %d", device_id, ret); + GELOGE(RT_FAILED, "[Call][RtCtxCreate] Failed to create new context for device:%u, ret:%d", device_id, ret); return; } ret = rtCtxSetCurrent(current_); if (ret != RT_ERROR_NONE) { - REPORT_CALL_ERROR("E19999", "Call rtCtxSetCurrent failed, device_id:%u, ret:0x%X,", - device_id, ret); - GELOGE(RT_FAILED, "Failed to switch context to normal, context %p, device %u", current_, device_id); + REPORT_CALL_ERROR("E19999", "Call rtCtxSetCurrent failed, device_id:%u, ret:0x%X", device_id, ret); + GELOGE(RT_FAILED, "[Call][RtCtxSetCurrent] failed, device_id:%u, ret:0x%X", device_id, ret); return; } GELOGD("Create and switch rt context %p type %d for device %u, backup last %p.", current_, mode, device_id, last_); @@ -80,7 +80,7 @@ int64_t CalcVarSizeInBytes(const GeTensorDesc &desc) { if (var_size <= 0) { REPORT_INNER_ERROR("E19999", "Data type:%s in desc, it's size:%ld < 0, check invalid", TypeUtils::DataTypeToSerialString(desc.GetDataType()).c_str(), var_size); - GELOGE(PARAM_INVALID, "Failed to calc var data size from data type %s", + GELOGE(PARAM_INVALID, "[Calc][VarDataSize] by data type %s failed.", TypeUtils::DataTypeToSerialString(desc.GetDataType()).c_str()); return -1; } @@ -99,7 +99,8 @@ Status CopyVarToDevice(const NodePtr &var, const formats::TransResult &trans_res if (ret != RT_ERROR_NONE) { REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, op:%s(%s), size:%lu, ret:0x%X,", var->GetName().c_str(), var->GetType().c_str(), trans_result.length, ret); - GELOGE(RT_FAILED, "Failed to copy memory to device, size %zu", trans_result.length); + GELOGE(RT_FAILED, "[Call][RtMemcpy] failed, op:%s(%s), size:%lu, ret:0x%X,", var->GetName().c_str(), + var->GetType().c_str(), trans_result.length, ret); return RT_FAILED; } return SUCCESS; @@ -111,21 +112,17 @@ Status CopyVarFromDevice(uint64_t session_id, const NodePtr &var, std::unique_pt GE_CHECK_NOTNULL(var); auto ret = VarManager::Instance(session_id)->GetVarAddr(var->GetName(), input_desc, &var_logic); if (ret != SUCCESS) { - GELOGE(INTERNAL_ERROR, - "Failed to copy var %s from device, can not find it" - " from var manager %u", - var->GetName().c_str(), ret); + GELOGE(INTERNAL_ERROR, "[Get][VarAddr] failed, node:%s, session_id:%lu, ret:%d", + var->GetName().c_str(), session_id, ret); return INTERNAL_ERROR; } uint8_t *var_addr = VarManager::Instance(session_id)->GetVarMemoryAddr(var_logic, RT_MEMORY_HBM); if (var_addr == nullptr) { - REPORT_CALL_ERROR("E19999", "Get variable memory addr failed, mem_type:%d, op:%s(%s), session_id:%lu,", + REPORT_CALL_ERROR("E19999", "Get variable memory addr failed, mem_type:%d, op:%s(%s), session_id:%lu", RT_MEMORY_HBM, var->GetName().c_str(), var->GetType().c_str(), session_id); - GELOGE(INTERNAL_ERROR, - "Failed to copy var %s from device, cant not get " - "var addr from logic addr %p", - var->GetName().c_str(), var_logic); + GELOGE(INTERNAL_ERROR, "[Get][VarMemoryAddr] failed, mem_type:%d, op:%s(%s), session_id:%lu", + RT_MEMORY_HBM, var->GetName().c_str(), var->GetType().c_str(), session_id); return INTERNAL_ERROR; } @@ -136,9 +133,10 @@ Status CopyVarFromDevice(uint64_t session_id, const NodePtr &var, std::unique_pt std::unique_ptr var_host(new(std::nothrow) uint8_t[var_size_bytes]); if (var_host == nullptr) { - REPORT_CALL_ERROR("E19999", "New host memory failed, size:%ld, op:%s(%s), session_id:%lu,", + REPORT_CALL_ERROR("E19999", "New host memory failed, size:%ld, op:%s(%s), session_id:%lu", var_size_bytes, var->GetName().c_str(), var->GetType().c_str(), session_id); - GELOGE(OUT_OF_MEMORY, "Failed to malloc rt-host memory, size %ld", var_size_bytes); + GELOGE(OUT_OF_MEMORY, "[New][Memory] for rt-host failed, size:%ld, op:%s(%s), session_id:%lu", + var_size_bytes, var->GetName().c_str(), var->GetType().c_str(), session_id); return OUT_OF_MEMORY; } @@ -147,10 +145,8 @@ Status CopyVarFromDevice(uint64_t session_id, const NodePtr &var, std::unique_pt if (ret != RT_ERROR_NONE) { REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%ld, op:%s(%s), session_id:%lu, ret:0x%X", var_size_bytes, var->GetName().c_str(), var->GetType().c_str(), session_id, ret); - GELOGE(RT_FAILED, - "Failed to copy var memory from device, var %s, size %ld," - " rt-error-code %u", - var->GetName().c_str(), var_size_bytes, ret); + GELOGE(RT_FAILED, "[Call][RtMemcpy] failed, size:%ld, op:%s(%s), session_id:%lu, ret:0x%X", + var_size_bytes, var->GetName().c_str(), var->GetType().c_str(), session_id, ret); return RT_FAILED; } @@ -197,9 +193,7 @@ Status TransVarOnHost(uint8_t *var_data, const VarTransRoad &trans_road, formats formats::ShapeToString(src_shape).c_str(), formats::ShapeToString(dst_shape).c_str(), TypeUtils::DataTypeToSerialString(data_type).c_str(), ret); - GELOGE(INTERNAL_ERROR, - "Failed to trans format from %s to %s, shape %s to %s, " - "data type %s error code %u", + GELOGE(INTERNAL_ERROR, "[Trans][Format] from %s to %s, shape %s to %s failed, data type %s error code %u", TypeUtils::FormatToSerialString(src_format).c_str(), TypeUtils::FormatToSerialString(dst_format).c_str(), formats::ShapeToString(src_shape).c_str(), formats::ShapeToString(dst_shape).c_str(), TypeUtils::DataTypeToSerialString(data_type).c_str(), ret); @@ -221,7 +215,7 @@ Status TransVarOnHost(uint8_t *var_data, const VarTransRoad &trans_road, formats TypeUtils::DataTypeToSerialString(src_data_type).c_str(), TypeUtils::DataTypeToSerialString(dst_data_type).c_str(), formats::ShapeToString(input_shape).c_str(), src_data_size, ret); - GELOGE(INTERNAL_ERROR, "Failed to trans data type from %s to %s, input shape %s, data size %ld, error code %u", + GELOGE(INTERNAL_ERROR, "[Trans][DataType] from %s to %s failed, input shape %s, data size %ld, error code %u", TypeUtils::DataTypeToSerialString(src_data_type).c_str(), TypeUtils::DataTypeToSerialString(dst_data_type).c_str(), formats::ShapeToString(input_shape).c_str(), src_data_size, ret); @@ -230,7 +224,7 @@ Status TransVarOnHost(uint8_t *var_data, const VarTransRoad &trans_road, formats } else { REPORT_INNER_ERROR("E19999", "Trans var data failed, the trans type %s does not supported, check invalid", trans_info.node_type.c_str()); - GELOGE(UNSUPPORTED, "Failed to trans var data, the trans type %s does not supported", + GELOGE(UNSUPPORTED, "[Trans][VarData] failed, the trans type %s does not supported", trans_info.node_type.c_str()); return UNSUPPORTED; } @@ -255,10 +249,8 @@ Status ReAssignVarAddr(uint64_t session_id, uint8_t *var_logic = nullptr; Status ret = VarManager::Instance(session_id)->GetVarAddr(var_name, tensor_desc, &var_logic); if (ret != SUCCESS) { - GELOGE(INTERNAL_ERROR, - "Failed to get var %s device addr, can not find it" - " from var manager %u", - var_name.c_str(), ret); + GELOGE(INTERNAL_ERROR, "[Get][VarAddr] failed, var name:%s, session_id:%lu, ret:%u", + var_name.c_str(), session_id, ret); return INTERNAL_ERROR; } @@ -266,7 +258,8 @@ Status ReAssignVarAddr(uint64_t session_id, if (var_addr == nullptr) { REPORT_CALL_ERROR("E19999", "Get variable memory addr failed, mem_type:%d, var_name:%s, session_id:%lu,", RT_MEMORY_HBM, var_name.c_str(), session_id); - GELOGE(INTERNAL_ERROR, "Failed to convert var %s logic addr to real addr", var_name.c_str()); + GELOGE(INTERNAL_ERROR, "[Get][VarMemoryAddr] failed, mem_type:%d, var_name:%s, session_id:%lu", + RT_MEMORY_HBM, var_name.c_str(), session_id); return INTERNAL_ERROR; } *var_device = var_addr; @@ -293,9 +286,8 @@ Status TransVarData(const NodePtr &var, const VarTransRoad &trans_road, uint64_t // Sync var data from device std::unique_ptr var_data; if (trans_road.empty()) { - REPORT_INNER_ERROR("E19999", "Param trans_road is empty, session_id:%lu, check invalid", - session_id); - GELOGE(INTERNAL_ERROR, "Failed to get trans_road, trans_road is empty."); + REPORT_INNER_ERROR("E19999", "Param trans_road is empty, session_id:%lu, check invalid", session_id); + GELOGE(INTERNAL_ERROR, "[Check][Param] trans_road is empty, session_id:%lu", session_id); return INTERNAL_ERROR; } const GeTensorDesc &input_desc = trans_road.begin()->input; @@ -307,7 +299,7 @@ Status TransVarData(const NodePtr &var, const VarTransRoad &trans_road, uint64_t formats::TransResult trans_result{}; ret = TransVarOnHost(var_data.get(), trans_road, trans_result); if (ret != SUCCESS) { - GELOGE(ret, "Failed to trans var data on host, error code %u", ret); + GELOGE(ret, "[Call][TransVarOnHost] failed, session_id:%lu, ret:%u", session_id, ret); return ret; } @@ -319,14 +311,15 @@ Status TransVarData(const NodePtr &var, const VarTransRoad &trans_road, uint64_t /// TensorDesc needs to be removed. This change is large and needs to be performed step by step. ret = ReAssignVarAddr(session_id, var->GetName(), trans_road.rbegin()->output, &var_device); if (ret != SUCCESS) { - GELOGE(ret, "Failed to re-assign memory on device, size %zu", trans_result.length); + GELOGE(ret, "[Call][ReAssignVarAddr] failed, session id:%lu, op:%s, ret:%u", + session_id, var->GetName().c_str(), ret); return ret; } // sync new data to device ret = CopyVarToDevice(var, trans_result, var_device); if (ret != SUCCESS) { - GELOGE(ret, "Failed to send var data to device"); + GELOGE(ret, "[Call][CopyVarToDevice] failed, var:%s, ret:%u", var->GetName().c_str(), ret); return ret; } @@ -350,7 +343,10 @@ Status TransTensor(uint8_t *var_data, const NodePtr &var_src, const NodePtr &var TypeUtils::DataTypeToSerialString(src_data_datatype).c_str(), TypeUtils::DataTypeToSerialString(dst_data_datatype).c_str(), src_data_shape_size, ret); - GELOGE(INTERNAL_ERROR, "trans var data on host failed"); + GELOGE(INTERNAL_ERROR, "[Trans][DataType] from %s to %s failed, data size %ld, ret:%u", + TypeUtils::DataTypeToSerialString(src_data_datatype).c_str(), + TypeUtils::DataTypeToSerialString(dst_data_datatype).c_str(), + src_data_shape_size, ret); return ret; }); return SUCCESS; @@ -366,9 +362,11 @@ Status CopyTensorFromSrcVarNode(const NodePtr &var_src, /// need copy value from var_fp32 to var_fp16. /// [opdesc of var_src and var_dst are checked before passed in, no need to check if they are nullptr] GE_IF_BOOL_EXEC(var_src == nullptr || var_dst == nullptr, - REPORT_INNER_ERROR("E19999", "Param var_src or var_dst is empty, session_id:%lu, device_id:%u, " + REPORT_INNER_ERROR("E19999", "Param var_src or var_dst is nullptr, session_id:%lu, device_id:%u, " "check invalid", session_id, device_id); - GELOGE(FAILED, "node var is nullptr"); return FAILED); + GELOGE(FAILED, "[Check][Param] Param var_src or var_dst is nullptr, session_id:%lu, device_id:%u", + session_id, device_id); + return FAILED); // src_node output_desc (fp32) GeTensorDesc output_desc = var_src->GetOpDesc()->GetOutputDesc(0); auto src_data_type = output_desc.GetDataType(); @@ -390,31 +388,45 @@ Status CopyTensorFromSrcVarNode(const NodePtr &var_src, RtContextSwitchGuard switch_context(RT_CTX_NORMAL_MODE, device_id); // copy from src_node auto ret = CopyVarFromDevice(session_id, var_src, var_src_data, output_desc); - GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(FAILED, "Copy Var From Device failed"); return ret); + GE_IF_BOOL_EXEC(ret != SUCCESS, + GELOGE(FAILED, "[Call][CopyVarFromDevice] failed, session id:%lu, var_src:%s", + session_id, var_src->GetName().c_str()); + return ret); // trans dtype formats::TransResult trans_result{}; ret = TransTensor(var_src_data.get(), var_src, var_dst, trans_result); - GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(INTERNAL_ERROR, "trans var data on host failed"); return ret); + GE_IF_BOOL_EXEC(ret != SUCCESS, + GELOGE(INTERNAL_ERROR, "[Trans][Tensor] failed, var_src:%s, var_dst:%s", + var_src->GetName().c_str(), var_dst->GetName().c_str()); + return ret); // reset src value. void *var_device = nullptr; ret = ReAssignVarAddr(session_id, var_dst->GetName(), dst_tensor_desc, &var_device); - GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(INTERNAL_ERROR, "assign mem failed"); return ret); + GE_IF_BOOL_EXEC(ret != SUCCESS, + GELOGE(INTERNAL_ERROR, "[Call][ReAssignVarAddr] failed, session id:%lu, var_dst:%s", + session_id, var_dst->GetName().c_str()); + return ret); // copy to device ret = CopyVarToDevice(var_dst, trans_result, var_device); - GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(ret, "Failed to send var data to device"); return ret); + GE_IF_BOOL_EXEC(ret != SUCCESS, + GELOGE(ret, "[Call][CopyVarToDevice] failed, var_dst:%s, ret:%u", + var_dst->GetName().c_str(), ret); + return ret); return SUCCESS; } } // namespace Status TransVarDataUtils::SyncVarData2BroadCast(const string &var_name, const ge::GeTensorDesc &src_tensor_desc, uint8_t *dst_addr, int64_t dst_addr_size, uint64_t session_id) { - GE_CHK_BOOL_RET_STATUS(dst_addr != nullptr, FAILED, "dst addr is null. "); + GE_CHK_BOOL_RET_STATUS(dst_addr != nullptr, FAILED, "[Check][Param] dst addr is nullptr."); uint8_t *src_host_addr = nullptr; int64_t src_addr_size = 0; GE_MAKE_GUARD_RTMEM(src_host_addr); GE_CHK_STATUS_RET(SyncTensorToHost(var_name, src_tensor_desc, &src_host_addr, src_addr_size, session_id)); GELOGI("src_addr_size: %ld, dst_addr_size: %ld", src_addr_size, dst_addr_size); - GE_CHK_BOOL_RET_STATUS(src_addr_size == dst_addr_size, FAILED, "var data size is not equal broadcast "); + GE_CHK_BOOL_RET_STATUS(src_addr_size == dst_addr_size, FAILED, + "[Check][Param] src_addr_size:%ld not equal to dst_addr_size:%ld", + src_addr_size, dst_addr_size); GE_CHK_RT_RET(rtMemcpy(dst_addr, dst_addr_size, src_host_addr, src_addr_size, RT_MEMCPY_HOST_TO_DEVICE)); return SUCCESS; @@ -422,7 +434,7 @@ Status TransVarDataUtils::SyncVarData2BroadCast(const string &var_name, const ge Status TransVarDataUtils::SyncBroadCastData2Var(uint8_t *src_addr, int64_t src_addr_size, const string &var_name, const ge::GeTensorDesc &dst_tensor_desc, uint64_t session_id) { - GE_CHK_BOOL_RET_STATUS(src_addr != nullptr, FAILED, "src addr is null. "); + GE_CHK_BOOL_RET_STATUS(src_addr != nullptr, FAILED, "[Check][Param] src addr is nullptr. "); uint8_t *host_addr = nullptr; GE_MAKE_GUARD_RTMEM(host_addr); GE_CHK_RT_RET(rtMallocHost(reinterpret_cast(&host_addr), src_addr_size)); @@ -436,7 +448,7 @@ Status TransVarDataUtils::SyncBroadCastData2Var(uint8_t *src_addr, int64_t src_a Status TransVarDataUtils::SyncTensorToHost(const string &var_name, const ge::GeTensorDesc &src_tensor_desc, uint8_t **host_addr, int64_t &src_tensor_size, uint64_t session_id) { - GE_CHK_STATUS_RET(ge::TensorUtils::GetSize(src_tensor_desc, src_tensor_size), "get size from TensorDesc failed"); + GE_CHK_STATUS_RET(ge::TensorUtils::GetSize(src_tensor_desc, src_tensor_size), "[Get][Size] from TensorDesc failed"); uint8_t *src_addr = nullptr; GE_CHK_STATUS_RET(VarManager::Instance(session_id)->GetVarAddr(var_name, src_tensor_desc, &src_addr)); @@ -493,7 +505,8 @@ Status TransVarDataUtils::TransAllVarData(const vector &variable_nodes, if (rt_ret != RT_ERROR_NONE) { REPORT_CALL_ERROR("E19999", "Call rtCtxSetCurrent failed, session_id:%lu, graph_id:%u, ret:0x%X,", session_id, graph_id, rt_ret); - GELOGE(RT_FAILED, "Failed to set context, error_code is: 0x%X.", rt_ret); + GELOGE(RT_FAILED, "[Call][RtCtxSetCurrent] failed, session_id:%lu, graph_id:%u, ret:0x%X,", + session_id, graph_id, rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } uint32_t allocated_graph_id = 0; @@ -501,8 +514,8 @@ Status TransVarDataUtils::TransAllVarData(const vector &variable_nodes, if (ret != SUCCESS) { REPORT_CALL_ERROR("E19999", "Get allocated GraphId failed, session_id:%lu, graph_id:%u, ret:0x%X,", session_id, graph_id, ret); - GELOGE(INTERNAL_ERROR, "var has not been allocated, node:%s, graph_id:%u.", node->GetName().c_str(), - graph_id); + GELOGE(INTERNAL_ERROR, "[Get][AllocatedGraphId] failed, node:%s, graph_id:%u.", + node->GetName().c_str(), graph_id); return INTERNAL_ERROR; } uint32_t changed_graph_id = 0; @@ -518,7 +531,8 @@ Status TransVarDataUtils::TransAllVarData(const vector &variable_nodes, } ret = TransVarData(node, *trans_road, session_id); if (ret != SUCCESS) { - GELOGE(INTERNAL_ERROR, "TransVarData failed, node:%s, graph_id:%u.", node->GetName().c_str(), graph_id); + GELOGE(INTERNAL_ERROR, "[Trans][VarData] failed, node:%s, graph_id:%u, session_id:%lu.", + node->GetName().c_str(), graph_id, session_id); return INTERNAL_ERROR; } VarManager::Instance(session_id)->RemoveChangedGraphId(node->GetName()); @@ -527,7 +541,7 @@ Status TransVarDataUtils::TransAllVarData(const vector &variable_nodes, }, node, session_id, context, graph_id, ErrorManager::GetInstance().GetErrorManagerContext()); if (!f.valid()) { - GELOGE(FAILED, "Future is invalid"); + GELOGE(FAILED, "[Check][Param] Future is invalid, session id:%lu, graph id:%u", session_id, graph_id); return FAILED; } vector_future.push_back(std::move(f)); @@ -537,7 +551,7 @@ Status TransVarDataUtils::TransAllVarData(const vector &variable_nodes, for (size_t i = 0; i < vector_future.size(); ++i) { ret_status = vector_future[i].get(); if (ret_status != SUCCESS) { - GELOGE(ret_status, "TransAllVarData:: trans %zu vardata failed", i); + GELOGE(ret_status, "[Check][Param] trans %zu vardata failed", i); return ret_status; } } @@ -550,7 +564,8 @@ Status TransVarDataUtils::CopyVarData(const ComputeGraphPtr &compute_graph, uint if (compute_graph == nullptr) { REPORT_INNER_ERROR("E19999", "Param compute_graph is nullptr, session_id:%lu, device_id:%u, check invalid", session_id, device_id); - GELOGE(FAILED, "compute_graph is nullptr"); + GELOGE(FAILED, "[Check][Param] compute_graph is nullptr, session_id:%lu, device_id:%u", + session_id, device_id); return FAILED; } @@ -568,7 +583,10 @@ Status TransVarDataUtils::CopyVarData(const ComputeGraphPtr &compute_graph, uint GELOGI("current_var_node__: [%s] copy_from_var_node__: [%s].", node->GetName().c_str(), src_node->GetName().c_str()); auto ret = CopyTensorFromSrcVarNode(src_node, node, session_id, device_id); - GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(FAILED, "copy tensor failed!"); return FAILED); + GE_IF_BOOL_EXEC(ret != SUCCESS, + GELOGE(FAILED, "[Copy][Tensor] failed, src_node:%s, node:%s, session_id:%lu, device_id:%u", + src_node->GetName().c_str(), node->GetName().c_str(), session_id, device_id); + return FAILED); // only copy once (void) ge::AttrUtils::SetBool(node->GetOpDesc(), "_copy_value", true); // no need to check value } diff --git a/ge/graph/manager/util/debug.cc b/ge/graph/manager/util/debug.cc index 1dd97bc1..d20280fb 100644 --- a/ge/graph/manager/util/debug.cc +++ b/ge/graph/manager/util/debug.cc @@ -63,17 +63,15 @@ Status Debug::DumpDevMem(const char *file, const void *addr, int64_t size) { uint8_t *host_addr = nullptr; rtError_t ret = rtMallocHost(reinterpret_cast(&host_addr), size); if (ret != RT_ERROR_NONE) { - REPORT_CALL_ERROR("E19999", "Call rtMallocHost failed, size:%zu, ret: 0x%X", - size, ret); - GELOGE(FAILED, "Call rt api rtMallocHost failed, ret: 0x%X", ret); + REPORT_CALL_ERROR("E19999", "Call rtMallocHost failed, size:%zu, ret:0x%X", size, ret); + GELOGE(FAILED, "[Call][RtMallocHost] failed, size:%zu, ret:0x%X", size, ret); return FAILED; } GE_MAKE_GUARD_RTMEM(host_addr); ret = rtMemcpy(host_addr, size, addr, size, RT_MEMCPY_DEVICE_TO_HOST); if (ret != RT_ERROR_NONE) { - REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%zu, ret: 0x%X", - size, ret); - GELOGE(FAILED, "Call rt api rtMemcpy failed, ret: 0x%X", ret); + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%zu, ret:0x%X", size, ret); + GELOGE(FAILED, "[Call][RtMemcpy] failed, size:%zu, ret:0x%X", size, ret); return FAILED; } diff --git a/ge/graph/manager/util/hcom_util.cc b/ge/graph/manager/util/hcom_util.cc index a30321f9..1843127f 100644 --- a/ge/graph/manager/util/hcom_util.cc +++ b/ge/graph/manager/util/hcom_util.cc @@ -28,7 +28,8 @@ Status HcomOmeUtil::GetHcclDataType(const ge::ConstOpDescPtr &op_desc, std::vector &kernel_hccl_infos) { GE_CHECK_NOTNULL(op_desc); if (CheckKernelHcclInfo(op_desc, kernel_hccl_infos) != SUCCESS) { - GELOGE(PARAM_INVALID, "HcomOmeUtil:: the number of GETaskKernelHcclInfo is invalid."); + GELOGE(PARAM_INVALID, "[Check][KernelHcclInfo] failed, op:%s(%s).", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); return PARAM_INVALID; } GELOGI("GetHcclDataType start, node[%s], opType[%s].", op_desc->GetName().c_str(), op_desc->GetType().c_str()); @@ -40,10 +41,10 @@ Status HcomOmeUtil::GetHcclDataType(const ge::ConstOpDescPtr &op_desc, if (op_desc->GetType() == HCOMRECEIVE) { bool ret = ge::AttrUtils::GetDataType(op_desc, HCOM_ATTR_DATA_TYPE, src_data_type); if (ret == false) { - REPORT_INNER_ERROR("E19999", "Get Attr:%s in op:%s(%s) fail", - HCOM_ATTR_DATA_TYPE.c_str(), + REPORT_INNER_ERROR("E19999", "Get Attr:%s in op:%s(%s) fail", HCOM_ATTR_DATA_TYPE.c_str(), op_desc->GetName().c_str(), op_desc->GetType().c_str()); - GELOGE(PARAM_INVALID, "op:HcomReceive, op desc no attr: dtype."); + GELOGE(PARAM_INVALID, "[Get][Attr] %s in op:%s(%s) fail", HCOM_ATTR_DATA_TYPE.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); return PARAM_INVALID; } } else { @@ -55,13 +56,11 @@ Status HcomOmeUtil::GetHcclDataType(const ge::ConstOpDescPtr &op_desc, auto iter = kConstOpHcclDataType.find(static_cast(src_data_type)); if (iter == kConstOpHcclDataType.end()) { REPORT_INNER_ERROR("E19999", "Attr:%s in op:%s(%s), value data_type:%s, not support in kConstOpHcclDataType now, " - "check invalid", HCOM_ATTR_DATA_TYPE.c_str(), - op_desc->GetName().c_str(), op_desc->GetType().c_str(), - ge::TypeUtils::DataTypeToSerialString(src_data_type).c_str()); - GELOGE(PARAM_INVALID, - "HcomOmeUtil:: Node: %s Optype: %s HcomDataType cann't support! Current Davinci Data Type : %s", - op_desc->GetName().c_str(), op_desc->GetType().c_str(), - ge::TypeUtils::DataTypeToSerialString(src_data_type).c_str()); + "check invalid", HCOM_ATTR_DATA_TYPE.c_str(), op_desc->GetName().c_str(), + op_desc->GetType().c_str(), ge::TypeUtils::DataTypeToSerialString(src_data_type).c_str()); + GELOGE(PARAM_INVALID, "[Check][Param] Attr:%s in op:%s(%s), value data_type:%s, " + "not support in kConstOpHcclDataType now", HCOM_ATTR_DATA_TYPE.c_str(), op_desc->GetName().c_str(), + op_desc->GetType().c_str(), ge::TypeUtils::DataTypeToSerialString(src_data_type).c_str()); return PARAM_INVALID; } @@ -73,7 +72,7 @@ Status HcomOmeUtil::GetHcclDataType(const ge::ConstOpDescPtr &op_desc, Status HcomOmeUtil::GetHcclTypeSize(HcclDataType data_type, int32_t &size) { auto iter = kConstOpHcclDataTypeSize.find(data_type); GE_CHK_BOOL_EXEC(iter != kConstOpHcclDataTypeSize.end(), return PARAM_INVALID, - "HcomOmeUtil::HcomDataTypeSize , No DataTypeSize!"); + "[Check][Param] param data_type:%d not find", data_type); size = iter->second; return SUCCESS; @@ -83,21 +82,22 @@ Status HcomOmeUtil::GetHcomCount(const ge::ConstOpDescPtr &op_desc, HcclDataType int &count) { GE_CHECK_NOTNULL(op_desc); if (!IsHCOMOp(op_desc->GetType())) { - REPORT_INNER_ERROR("E19999", "Op:%s(%s) is not hcom op, check invalid", - op_desc->GetName().c_str(), op_desc->GetType().c_str()); - GELOGE(PARAM_INVALID, "HcomOmeUtil:: operator is not Hcom operator."); + REPORT_INNER_ERROR("E19999", "Op:%s(%s) is not hcom op, check invalid", op_desc->GetName().c_str(), + op_desc->GetType().c_str()); + GELOGE(PARAM_INVALID, "[Check][Param] Op:%s(%s) is not hcom op", op_desc->GetName().c_str(), + op_desc->GetType().c_str()); return PARAM_INVALID; } int64_t total_size = 0; int64_t align_size = 512; int32_t size = 0; - GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclTypeSize(data_type, size), "GetHcomCount: GetHcclTypeSize fail!"); + GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclTypeSize(data_type, size), "[Get][HcclTypeSize] fail, datatype:%d", data_type); if (op_desc->GetType() == HCOMRECEIVE) { for (size_t i = 0; i < op_desc->GetOutputsSize(); ++i) { int64_t output_size = 0; GE_CHECK_NOTNULL(op_desc->GetOutputDescPtr(i)); GE_CHK_STATUS_RET(ge::TensorUtils::GetSize(*op_desc->GetOutputDescPtr(i), output_size), - "Get size from TensorDesc failed, op: %s, output index: %zu.", op_desc->GetName().c_str(), i); + "[Get][Size] from TensorDesc failed, op:%s, output index:%zu.", op_desc->GetName().c_str(), i); output_size = (output_size + align_size - 1) / align_size * align_size; total_size += output_size; } @@ -107,42 +107,48 @@ Status HcomOmeUtil::GetHcomCount(const ge::ConstOpDescPtr &op_desc, HcclDataType int64_t block_size = 0; GE_CHECK_NOTNULL(op_desc->GetInputDescPtr(i)); GE_CHK_STATUS_RET(ge::TensorUtils::GetSize(*op_desc->GetInputDescPtr(i), input_size), - "get size from TensorDesc failed, op : %s, input index : %zu", op_desc->GetName().c_str(), i); + "[Get][Size] from TensorDesc failed, op:%s, input index:%zu", op_desc->GetName().c_str(), i); // dynamic shape hccl op get size from output tensor desc if (op_desc->HasAttr(ATTR_NAME_IS_UNKNOWN_SHAPE)) { GE_CHECK_NOTNULL(op_desc->GetOutputDescPtr(i)); GE_CHK_STATUS_RET(ge::TensorUtils::GetSize(*op_desc->GetOutputDescPtr(i), input_size), - "get size from TensorDesc failed, op : %s, input index : %zu", op_desc->GetName().c_str(), i); + "[Get][Size] from TensorDesc failed, op:%s, input index:%zu", op_desc->GetName().c_str(), i); } GE_IF_BOOL_EXEC( op_desc->GetType() == HCOMREDUCESCATTER, int32_t rank_size = 0; GE_CHK_BOOL_RET_STATUS(ge::AttrUtils::GetInt(op_desc, HCOM_ATTR_RANK_SIZE, rank_size), PARAM_INVALID, - "get HCOM_ATTR_RANK_SIZE failed"); - GE_CHK_BOOL_RET_STATUS(rank_size != 0, PARAM_INVALID, "rank size is zero"); - int64_t shape_size = op_desc->GetInputDescPtr(i)->GetShape().GetShapeSize(); GE_CHK_STATUS_RET( - ge::CheckInt64Uint32MulOverflow(shape_size, size), "Product of shape size and size beyond INT64_MAX"); + "[Get][Attr] %s in op:%s(%s) failed", HCOM_ATTR_RANK_SIZE.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); + GE_CHK_BOOL_RET_STATUS(rank_size != 0, PARAM_INVALID, "[Check][Param] rank size is zero"); + int64_t shape_size = op_desc->GetInputDescPtr(i)->GetShape().GetShapeSize(); + GE_CHK_STATUS_RET(ge::CheckInt64Uint32MulOverflow(shape_size, size), + "[Check][Param] Product of shape size:%ld and size:%d beyond INT64_MAX, op:%s(%s)", + shape_size, size, op_desc->GetName().c_str(), op_desc->GetType().c_str()); block_size = (shape_size * size) / rank_size; - GE_CHK_STATUS_RET(ge::CheckInt64AddOverflow(total_size, block_size), "Total size is beyond the INT64_MAX"); + GE_CHK_STATUS_RET(ge::CheckInt64AddOverflow(total_size, block_size), + "[Check][Param] Total size:%ld is beyond the INT64_MAX, op:%s(%s)", + total_size, op_desc->GetName().c_str(), op_desc->GetType().c_str()); total_size = total_size + block_size; continue;); int64_t shape_size = op_desc->GetInputDescPtr(i)->GetShape().GetShapeSize(); GELOGD("hcom util node %s inputsize %ld, shapesize %ld, datasize %d.", op_desc->GetName().c_str(), input_size, shape_size, size); GE_CHK_STATUS_RET(ge::CheckInt64Int32MulOverflow(shape_size, size), - "Product of shape size and size beyond INT64_MAX"); + "[Check][Param] Product of shape size:%ld and size:%d beyond INT64_MAX", shape_size, size); GE_IF_BOOL_EXEC(is_allgather, block_size = shape_size * size;); GE_IF_BOOL_EXEC(!is_allgather, block_size = (input_size + align_size - 1) / align_size * align_size;); - GE_CHK_STATUS_RET(ge::CheckInt64AddOverflow(total_size, block_size), "Total size is beyond the INT64_MAX"); + GE_CHK_STATUS_RET(ge::CheckInt64AddOverflow(total_size, block_size), + "[Check][Param] Total size:%ld is beyond the INT64_MAX", total_size); total_size = total_size + block_size; } } - GE_CHK_BOOL_RET_STATUS(size != 0, PARAM_INVALID, "Size is zero"); + GE_CHK_BOOL_RET_STATUS(size != 0, PARAM_INVALID, "[Check][Param] Size is zero"); count = static_cast(total_size / size); - GE_CHK_BOOL_EXEC(total_size % size == 0, return PARAM_INVALID, "total_size:%ld is not divisiable by size:%d.", - total_size, size); + GE_CHK_BOOL_EXEC(total_size % size == 0, return PARAM_INVALID, + "[Check][Param] total_size:%ld is not divisiable by size:%d.", total_size, size); return SUCCESS; } @@ -153,32 +159,34 @@ Status HcomOmeUtil::GetHorovodCount(const ge::ConstOpDescPtr &op_desc, if (!IsHorovodOp(op_desc->GetType())) { REPORT_INNER_ERROR("E19999", "Op:%s(%s) is not horovod op, check invalid", op_desc->GetName().c_str(), op_desc->GetType().c_str()); - GELOGE(PARAM_INVALID, "HcomOmeUtil:: operator is not Horovod operator."); + GELOGE(PARAM_INVALID, "[Call][IsHorovodOp] failed, Op:%s(%s) is not horovod op", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); return PARAM_INVALID; } int64_t align_size = 512; int32_t size = 0; for (size_t i = 0; i < op_desc->GetInputsSize(); i++) { GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclTypeSize(static_cast(kernel_hccl_infos[i].dataType), size), - "GetHorovodCount: GetHcclTypeSize fail!"); + "[Call][GetHcclTypeSize] fail, op:%s(%s)", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); int64_t input_size = 0; int64_t block_size = 0; GE_CHECK_NOTNULL(op_desc->GetInputDescPtr(i)); GE_CHK_STATUS_RET(ge::TensorUtils::GetSize(*op_desc->GetInputDescPtr(i), input_size), - "get size from TensorDesc failed, op : %s, input index : %zu", op_desc->GetName().c_str(), i); + "[Get][Size] from TensorDesc failed, op:%s, input index:%zu", op_desc->GetName().c_str(), i); int64_t shape_size = op_desc->GetInputDescPtr(i)->GetShape().GetShapeSize(); GE_CHK_STATUS_RET(ge::CheckInt64Int32MulOverflow(shape_size, size), - "Product of shape size and size beyond INT64_MAX"); + "[Check][Param] Product of shape size:%ld and size:%d beyond INT64_MAX", shape_size, size); if (kernel_hccl_infos[0].hccl_type == HVDCALLBACKALLGATHER) { block_size = shape_size * size; } else { block_size = (input_size + align_size - 1) / align_size * align_size; } - GE_CHK_BOOL_RET_STATUS(size != 0, PARAM_INVALID, "Size is zero"); - GE_CHK_BOOL_EXEC(block_size % size == 0, return PARAM_INVALID, "block_size:%ld is not divisiable by size:%d.", - block_size, size); + GE_CHK_BOOL_RET_STATUS(size != 0, PARAM_INVALID, "[Check][Param] Size is zero"); + GE_CHK_BOOL_EXEC(block_size % size == 0, return PARAM_INVALID, + "[Check][Param] block_size:%ld is not divisiable by size:%d.", block_size, size); kernel_hccl_infos[i].count = static_cast(block_size / size); } @@ -191,7 +199,8 @@ Status HcomOmeUtil::GetHcclCount(const ge::ConstOpDescPtr &op_desc, Status ret; ret = CheckKernelHcclInfo(op_desc, kernel_hccl_infos); if (ret != SUCCESS) { - GELOGE(PARAM_INVALID, "HcomOmeUtil:: the number of GETaskKernelHcclInfo is invalid."); + GELOGE(PARAM_INVALID, "[Check][KernelHcclInfo] failed, the number of GETaskKernelHcclInfo is invalid, op:%s(%s).", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); return PARAM_INVALID; } GELOGI("GetHcclCount start, node[%s], opType[%s].", op_desc->GetName().c_str(), op_desc->GetType().c_str()); @@ -200,7 +209,7 @@ Status HcomOmeUtil::GetHcclCount(const ge::ConstOpDescPtr &op_desc, ret = GetHcomCount(op_desc, static_cast(kernel_hccl_infos[0].dataType), kernel_hccl_infos[0].hccl_type == HCOMALLGATHER, count); if (ret != SUCCESS) { - GELOGE(ret, "HcomOmeUtil:: Node: %s Optype: %s get the Hcom operator hccl count fail.", + GELOGE(ret, "[Call][GetHcomCount] Node:%s Optype:%s get the Hcom operator hccl count fail.", op_desc->GetName().c_str(), op_desc->GetType().c_str()); return PARAM_INVALID; } @@ -210,7 +219,7 @@ Status HcomOmeUtil::GetHcclCount(const ge::ConstOpDescPtr &op_desc, if (IsHorovodOp(op_desc->GetType())) { ret = GetHorovodCount(op_desc, kernel_hccl_infos); if (ret != SUCCESS) { - GELOGE(PARAM_INVALID, "HcomOmeUtil:: Node: %s Optype: %s get the Horovod hccl operator count fail.", + GELOGE(PARAM_INVALID, "[Call][GetHorovodCount] Node:%s Optype:%s get the Horovod hccl operator count fail.", op_desc->GetName().c_str(), op_desc->GetType().c_str()); return PARAM_INVALID; } @@ -225,11 +234,10 @@ Status HcomOmeUtil::GetHcclOperationType(const ge::ConstOpDescPtr &op_desc, Hccl if (IsHCOMOp(op_desc->GetType())) { std::string hcom_op_type; GE_CHK_BOOL_EXEC(ge::AttrUtils::GetStr(op_desc, HCOM_ATTR_REDUCE_TYPE, hcom_op_type), - REPORT_INNER_ERROR("E19999", "Get Attr:%s in op:%s(%s) fail", - HCOM_ATTR_REDUCE_TYPE.c_str(), + REPORT_INNER_ERROR("E19999", "Get Attr:%s in op:%s(%s) fail", HCOM_ATTR_REDUCE_TYPE.c_str(), op_desc->GetName().c_str(), op_desc->GetType().c_str()); return PARAM_INVALID, - "HcomOmeUtil:: Node: %s Optype: %s Get HCOM_ATTR_REDUCE_TYPE fail, not support!", + "[Get][Attr] %s in op:%s(%s) fail", HCOM_ATTR_REDUCE_TYPE.c_str(), op_desc->GetName().c_str(), op_desc->GetType().c_str()); if (hcom_op_type == "min") { @@ -244,7 +252,9 @@ Status HcomOmeUtil::GetHcclOperationType(const ge::ConstOpDescPtr &op_desc, Hccl REPORT_INNER_ERROR("E19999", "Attr:%s in Op:%s(%s), hcom_op_type value:%s is not support now, " "check invalid", HCOM_ATTR_REDUCE_TYPE.c_str(), op_desc->GetName().c_str(), op_desc->GetType().c_str(), hcom_op_type.c_str()); - GELOGE(PARAM_INVALID, "HcomOmeUtil::Get HCOM_ATTR_REDUCE_TYPE fail, [%s] not support!", hcom_op_type.c_str()); + GELOGE(PARAM_INVALID, "[Check][Param] Attr:%s in Op:%s(%s), hcom_op_type value:%s is not support now", + HCOM_ATTR_REDUCE_TYPE.c_str(), op_desc->GetName().c_str(), + op_desc->GetType().c_str(), hcom_op_type.c_str()); return PARAM_INVALID; } } @@ -256,7 +266,7 @@ Status HcomOmeUtil::GetHcclOperationType(const ge::ConstOpDescPtr &op_desc, Hccl ATTR_HOROVOD_ATTR_REDUCE_TYPE.c_str(), op_desc->GetName().c_str(), op_desc->GetType().c_str()); return PARAM_INVALID, - "HcomOmeUtil:: Node: %s Optype: %s Get ATTR_HOROVOD_ATTR_REDUCE_TYPE fail, not support!", + "[Get][Attr] %s in op:%s(%s) fail", ATTR_HOROVOD_ATTR_REDUCE_TYPE.c_str(), op_desc->GetName().c_str(), op_desc->GetType().c_str()); auto iter = kHorovodRedOpToHcclRedOp.find(static_cast(horovod_op_type)); @@ -264,8 +274,8 @@ Status HcomOmeUtil::GetHcclOperationType(const ge::ConstOpDescPtr &op_desc, Hccl REPORT_INNER_ERROR("E19999", "Attr:%s in Op:%s(%s), horovod_op_type value:%ld is not support now, " "check invalid", ATTR_HOROVOD_ATTR_REDUCE_TYPE.c_str(), op_desc->GetName().c_str(), op_desc->GetType().c_str(), horovod_op_type); - GELOGE(PARAM_INVALID, "HcomOmeUtil:: Node: %s Optype: %s HcomOpType cann't support! Current HcomOpType : %ld", - op_desc->GetName().c_str(), op_desc->GetType().c_str(), horovod_op_type); + GELOGE(PARAM_INVALID, "[Check][Param] Attr:%s in Op:%s(%s), horovod_op_type value:%ld is not support now", + ATTR_HOROVOD_ATTR_REDUCE_TYPE.c_str(), op_desc->GetName().c_str(), op_desc->GetType().c_str(), horovod_op_type); return PARAM_INVALID; } op_type = iter->second; @@ -281,7 +291,7 @@ Status HcomOmeUtil::GetHcclRootId(const ge::ConstOpDescPtr &op_desc, int64_t &ro HCOM_ATTR_ROOT_RANK.c_str(), op_desc->GetName().c_str(), op_desc->GetType().c_str()); return PARAM_INVALID, - "HcomOmeUtil::Node %s Optype: %s Get HCOM_ATTR_ROOT_INDEX fail, not support!", + "[Get][Attr] %s in op:%s(%s) fail", HCOM_ATTR_ROOT_RANK.c_str(), op_desc->GetName().c_str(), op_desc->GetType().c_str()); return SUCCESS; @@ -296,7 +306,7 @@ Status HcomOmeUtil::GetAllRootId(const ge::ConstOpDescPtr &op_desc, int64_t root_id = 0; Status dmrt = GetHcclRootId(op_desc, root_id); if (dmrt != SUCCESS) { - GELOGE(FAILED, "davinci_model: GetHcomRootId fail! domi error: %u", dmrt); + GELOGE(FAILED, "[Get][HcclRootId] fail! domi error: %u", dmrt); return FAILED; } @@ -324,7 +334,8 @@ Status HcomOmeUtil::CheckKernelHcclInfo(const ge::ConstOpDescPtr &op_desc, REPORT_INNER_ERROR("E19999", "Op:%s(%s) is not hcom op or param kernel_hccl_infos.size:%zu != 1, " "check invalid", op_desc->GetName().c_str(), op_desc->GetType().c_str(), kernel_hccl_infos.size()); - GELOGE(PARAM_INVALID, "HcomOmeUtil:: in Hcom scenario, the number of GETaskKernelHcclInfo is invalid."); + GELOGE(PARAM_INVALID, "[Check][Param] Op:%s(%s) is not hcom op or param kernel_hccl_infos.size:%zu != 1", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), kernel_hccl_infos.size()); return PARAM_INVALID; } @@ -337,7 +348,9 @@ Status HcomOmeUtil::CheckKernelHcclInfo(const ge::ConstOpDescPtr &op_desc, "in op:%s(%s), check invalid", kernel_hccl_infos.size(), op_desc->GetInputsSize(), op_desc->GetName().c_str(), op_desc->GetType().c_str()); - GELOGE(PARAM_INVALID, "HcomOmeUtil:: in Horovod scenario, the number of GETaskKernelHcclInfo is invalid."); + GELOGE(PARAM_INVALID, "Param kernel_hccl_infos.size:%zu is empty or not equal to " + "input_desc size:%zu in op:%s(%s)", kernel_hccl_infos.size(), op_desc->GetInputsSize(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); return PARAM_INVALID; } } @@ -360,7 +373,7 @@ Status HcomOmeUtil::GetHorovodInputs(const ge::ConstOpDescPtr &op_desc, } if (CheckKernelHcclInfo(op_desc, kernel_hccl_infos) != SUCCESS) { - GELOGE(PARAM_INVALID, "HcomOmeUtil:: Node: %s Optype: %s the number of GETaskKernelHcclInfo is invalid.", + GELOGE(PARAM_INVALID, "[Check][KernelHcclInfo] Node:%s Optype:%s the number of GETaskKernelHcclInfo is invalid.", op_desc->GetName().c_str(), op_desc->GetType().c_str()); return PARAM_INVALID; } diff --git a/ge/graph/manager/util/variable_accelerate_ctrl.cc b/ge/graph/manager/util/variable_accelerate_ctrl.cc index 22f9169c..12ec5a49 100644 --- a/ge/graph/manager/util/variable_accelerate_ctrl.cc +++ b/ge/graph/manager/util/variable_accelerate_ctrl.cc @@ -54,7 +54,7 @@ void VarAccelerateCtrl::SetVarChanged(const std::string &var_name) { void VarAccelerateCtrl::AddGraph(uint32_t graph_id, const ComputeGraphPtr &compute_graph) { std::lock_guard lock(mutex_); if (compute_graph == nullptr) { - GELOGE(PARAM_INVALID, "Failed to add graph %u, the compute graph is null", graph_id); + GELOGE(PARAM_INVALID, "[Check][Param] Failed to add graph %u, the compute graph is null", graph_id); return; } auto &var_names = graph_ids_to_var_names_[graph_id];