From 612c82f405cdde94b26e538d58942e94dda5d9fd Mon Sep 17 00:00:00 2001 From: taoxiangdong Date: Thu, 10 Dec 2020 17:41:02 +0800 Subject: [PATCH] Profiling AR version 3 --- ge/common/profiling/ge_profiling.cc | 74 ++++++++++++++++-------- ge/common/profiling/profiling_manager.cc | 56 ++++++++++++++++-- ge/common/profiling/profiling_manager.h | 17 ++++++ 3 files changed, 119 insertions(+), 28 deletions(-) diff --git a/ge/common/profiling/ge_profiling.cc b/ge/common/profiling/ge_profiling.cc index bab699cc..4fc61ca0 100644 --- a/ge/common/profiling/ge_profiling.cc +++ b/ge/common/profiling/ge_profiling.cc @@ -22,6 +22,7 @@ #include "graph/load/graph_loader.h" #include "init/gelib.h" #include "framework/common/ge_inner_error_codes.h" +#include "opskernel_manager/ops_kernel_builder_manager.h" namespace { const uint32_t kDeviceListIndex = 3; @@ -44,7 +45,7 @@ const std::map kProfCommandTypeMap = { {kProfCommandhandleModelUnsubscribe, kProfModelUnsubscribe}}; } // namespace -bool TransProfConfigToParam(const MsprofCommandHandle &profCommand, vector &prof_config_params) { +bool TransProfConfigToParam(const ProfCommandHandleData &profCommand, vector &prof_config_params) { prof_config_params.clear(); prof_config_params.emplace_back(kDeviceNums); prof_config_params.emplace_back(std::to_string(profCommand.devNums)); @@ -71,7 +72,7 @@ bool isProfConfigValid(const uint32_t *deviceid_list, uint32_t device_nums) { return false; } if (device_nums == 0 || device_nums > MAX_DEV_NUM) { - GELOGE(ge::PARAM_INVALID, "The device nums is invalid."); + GELOGE(ge::PARAM_INVALID, "The device nums: %u is invalid.", device_nums); return false; } @@ -105,16 +106,26 @@ bool isProfConfigValid(const uint32_t *deviceid_list, uint32_t device_nums) { } ge::Status RegProfCtrlCallback(MsprofCtrlCallback func) { + if (func == nullptr) { + GELOGE(ge::PARAM_INVALID, "Msprof ctrl callback is nullptr."); + return ge::PARAM_INVALID; + } if (ge::ProfilingManager::Instance().GetMsprofCallback().msprofCtrlCallback != nullptr) { GELOGW("Msprof ctrl callback is exist, just ignore it."); } else { + GELOGI("GE register Msprof ctrl callback."); ge::ProfilingManager::Instance().SetMsprofCtrlCallback(func); } return ge::SUCCESS; } ge::Status RegProfSetDeviceCallback(MsprofSetDeviceCallback func) { + if (func == nullptr) { + GELOGE(ge::PARAM_INVALID, "MsprofSetDeviceCallback callback is nullptr."); + return ge::PARAM_INVALID; + } // Pass MsprofSetDeviceCallback to runtime + GELOGI("GE pass setdevice callback to runtime."); ge::Status rt_ret = rtRegDeviceStateCallback(kRtSetDeviceRegName.c_str(), static_cast(func)); if (rt_ret != ge::SUCCESS) { GELOGE(rt_ret, "Pass MsprofSetDeviceCallback to runtime failed!"); @@ -124,54 +135,71 @@ ge::Status RegProfSetDeviceCallback(MsprofSetDeviceCallback func) { } ge::Status RegProfReporterCallback(MsprofReporterCallback func) { + if (func == nullptr) { + GELOGE(ge::PARAM_INVALID, "MsprofReporterCallback callback is nullptr."); + return ge::PARAM_INVALID; + } if (ge::ProfilingManager::Instance().GetMsprofCallback().msprofCtrlCallback != nullptr) { GELOGW("Msprof reporter callback is exist, just ignore it."); } else { GELOGI("GE register Msprof reporter callback."); ge::ProfilingManager::Instance().SetMsprofReporterCallback(func); + // Pass MsprofReporterCallback to runtime + ge::Status rt_ret = rtSetMsprofReporterCallback(func); + if (rt_ret != ge::SUCCESS) { + GELOGE(rt_ret, "Pass MsprofReporterCallback to runtime failed!!"); + return rt_ret; + } + // Pass MsprofReporterCallback to hccl in opskernel so initialize + rt_ret = OpskernelBuilderManager::Instance().RegProfReporterCallback(func); + if (rt_ret != ge::SUCCESS) { + GELOGE(rt_ret, "Pass MsprofReporterCallback to hccl failed."); + return rt_ret; + } } - // Pass MsprofReporterCallback to runtime - ge::Status rt_ret = rtSetMsprofReporterCallback(func); - if (rt_ret != ge::SUCCESS) { - GELOGE(rt_ret, "Pass MsprofReporterCallback to runtime failed!!"); - return rt_ret; - } - // Pass MsprofReporterCallback to hccl in opskernel so initialize - return ge::SUCCESS; } ge::Status ProfCommandHandle(ProfCommandHandleType type, void *data, uint32_t len) { - GE_CHECK_NOTNULL(data); - MsprofCommandHandle *prof_config_param = (MsprofCommandHandle *)data; - if (!isProfConfigValid(prof_config_param->devIdList, prof_config_param->devNums)) { - return ge::FAILED; - } - std::vector prof_params; - if (!TransProfConfigToParam(*prof_config_param, prof_params)) { - GELOGE(ge::PARAM_INVALID, "Transfer profilerConfig to string vector failed"); - return ge::PARAM_INVALID; + if (type != kProfCommandhandleFinalize) { + GE_CHECK_NOTNULL(data); } + MsprofCommandHandleData *prof_config_param = (MsprofCommandHandleData *)data; auto iter = kProfCommandTypeMap.find(type); if (iter == kProfCommandTypeMap.end()) { GELOGW("The prof comand type is invalid."); return ge::PARAM_INVALID; } + std::vector prof_params; + if (type == kProfCommandhandleStart || type == kProfCommandhandleStop) { + if (!isProfConfigValid(prof_config_param->devIdList, prof_config_param->devNums)) { + return ge::FAILED; + } + + if (!TransProfConfigToParam(*prof_config_param, prof_params)) { + GELOGE(ge::PARAM_INVALID, "Transfer profilerConfig to string vector failed"); + return ge::PARAM_INVALID; + } + } ge::GraphLoader graph_loader; ge::Command command; command.cmd_params.clear(); command.cmd_type = iter->second; command.cmd_params = prof_params; - command.module_index = prof_config_param->profSwitch; - GELOGI("GE commandhandle execute, device nums:%s , deviceID:[%s], data type config: 0x%llx", prof_params[0].c_str(), - prof_params[kDeviceListIndex].c_str(), command.module_index); + if (type != kProfCommandhandleFinalize) { + command.module_index = prof_config_param->profSwitch; + } + GELOGI("GE commandhandle execute, Command Type: %d, data type config: 0x%llx", type, command.module_index); + if (type == kProfCommandhandleStart || type == kProfCommandhandleStop) { + GELOGI("Profiling device nums:%s , deviceID:[%s]", prof_params[0].c_str(), prof_params[kDeviceListIndex].c_str()); + } ge::Status ret = graph_loader.CommandHandle(command); if (ret != ge::SUCCESS) { GELOGE(ret, "Handle profiling command failed"); return ge::FAILED; } - GELOGI("Successfully execute profiling command 0x%llx.", command.module_index); + GELOGI("Successfully execute profiling command type: %d, command 0x%llx.", type, command.module_index); return ge::SUCCESS; } diff --git a/ge/common/profiling/profiling_manager.cc b/ge/common/profiling/profiling_manager.cc index fd56f15d..bfc747b5 100644 --- a/ge/common/profiling/profiling_manager.cc +++ b/ge/common/profiling/profiling_manager.cc @@ -22,6 +22,7 @@ #include "graph/ge_context.h" #include "runtime/base.h" #include "graph/load/new_model_manager/davinci_model.h" +#include "opskernel_manager/ops_kernel_builder_manager.h" namespace { const char *const kTrainingTrace = "training_trace"; @@ -41,7 +42,10 @@ namespace ge { ProfilingManager::ProfilingManager() : is_load_profiling_(false), is_execute_profiling_(false), is_training_trace_(false), - subscribe_count_(0) {} + subscribe_count_(0) { + prof_cb_.msprofCtrlCallback = nullptr; + prof_cb_.msprofReporterCallback = nullptr; +} ProfilingManager::~ProfilingManager() {} @@ -64,6 +68,10 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status ProfilingManager::In } if (is_execute_profiling_) { + if (prof_cb_.msprofCtrlCallback == nullptr) { + GELOGE(ge::PARAM_INVALID, "MsprofCtrlCallback callback is nullptr."); + return ge::PARAM_INVALID; + } int32_t cb_ret = prof_cb_.msprofCtrlCallback( static_cast(MsprofCtrlCallbackType::MSPROF_CTRL_INIT_GE_OPTIONS), static_cast(&prof_conf), sizeof(MsprofGeOptions)); @@ -116,7 +124,7 @@ ge::Status ProfilingManager::InitFromOptions(const Options &options, MsprofGeOpt // Parse json str for bp fp Status ret = ParseOptions(prof_conf.options); if (ret != ge::SUCCESS) { - GELOGE(ge::PARAM_INVALID, "Parse taining trace param failed."); + GELOGE(ge::PARAM_INVALID, "Parse training trace param failed."); return ge::PARAM_INVALID; } @@ -182,6 +190,10 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::StopProf } // stop profiling + if (prof_cb_.msprofCtrlCallback == nullptr) { + GELOGE(ge::PARAM_INVALID, "MsprofCtrlCallback callback is nullptr."); + return ge::PARAM_INVALID; + } int32_t cb_ret = prof_cb_.msprofCtrlCallback(static_cast(MsprofCtrlCallbackType::MSPROF_CTRL_FINALIZE), nullptr, 0); if (cb_ret != 0) { @@ -210,7 +222,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::Profilin .append(std::to_string(stream_id)).append(" ") .append(std::to_string(model_id)).append("\n")); - ReporterData reporter_data; + ReporterData reporter_data{}; reporter_data.deviceId = device_id; reporter_data.data = (unsigned char *)data.c_str(); reporter_data.dataLen = data.size(); @@ -298,7 +310,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::Profilin void ProfilingManager::GraphDescReport(const int32_t &device_id, const string &data) { #ifdef DAVINCI_SUPPORT_PROFILING - ReporterData reporter_data; + ReporterData reporter_data{}; int ret = -1; int32_t cb_ret = -1; size_t index = data.size() / kReportMaxLen; @@ -487,6 +499,11 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::ProfIn GELOGE(FAILED, "Runtime profiler start failed."); return FAILED; } + Status hccl_ret = OpskernelBuilderManager::Instance().ProfStart(model_load_mask); + if (hccl_ret != SUCCESS) { + GELOGE(FAILED, "Hccl profiler start failed."); + return FAILED; + } is_load_profiling_ = true; GELOGI("Prof init: model load profiling on."); } @@ -517,6 +534,11 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::ProfFi return FAILED; } + Status hccl_ret = OpskernelBuilderManager::Instance().ProfStop(PROF_MODEL_LOAD_MASK); + if (hccl_ret != SUCCESS) { + GELOGE(FAILED, "Hccl profiler stop failed."); + return FAILED; + } for (auto device_id_module : device_id_module_map_) { if (device_id_module.second != 0) { uint32_t device_id = static_cast(device_id_module.first); @@ -640,6 +662,12 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::ProfSt GELOGE(FAILED, "Runtime profiler config proc failed."); return FAILED; } + + Status hccl_ret = OpskernelBuilderManager::Instance().ProfStart(module); + if (hccl_ret != SUCCESS) { + GELOGE(FAILED, "Hccl profiler start failed."); + return FAILED; + } if ((module & PROF_MODEL_EXECUTE_MASK) == PROF_MODEL_EXECUTE_MASK) { for (int32_t i = 0; i < device_num; i++) { if (std::find(device_id_.begin(), device_id_.end(), device_list[i]) == device_id_.end()) { @@ -681,6 +709,11 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::ProfSt GELOGE(FAILED, "Prof stop: runtime profiler config proc failed."); return FAILED; } + Status hccl_ret = OpskernelBuilderManager::Instance().ProfStop(module); + if (hccl_ret != SUCCESS) { + GELOGE(FAILED, "Hccl profiler stop failed."); + return FAILED; + } uint64_t execute_model_mask = module & PROF_MODEL_EXECUTE_MASK; if (execute_model_mask == PROF_MODEL_EXECUTE_MASK) { for (int32_t i = 0; i < device_num; i++) { @@ -749,6 +782,10 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool ProfilingManager::Profilin } FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::PluginInit() const { + if (prof_cb_.msprofReporterCallback == nullptr) { + GELOGE(ge::PARAM_INVALID, "MsprofReporterCallback callback is nullptr."); + return ge::PARAM_INVALID; + } return prof_cb_.msprofReporterCallback( static_cast(MsprofReporterModuleId::MSPROF_MODULE_FRAMEWORK), static_cast(MsprofReporterCallbackType::MSPROF_REPORTER_INIT), @@ -757,6 +794,10 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::Plugin FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::PluginUnInit() const { #ifdef DAVINCI_SUPPORT_PROFILING + if (prof_cb_.msprofReporterCallback == nullptr) { + GELOGE(ge::PARAM_INVALID, "MsprofReporterCallback callback is nullptr."); + return ge::PARAM_INVALID; + } int32_t cb_ret = prof_cb_.msprofReporterCallback( static_cast(MsprofReporterModuleId::MSPROF_MODULE_FRAMEWORK), static_cast(MsprofReporterCallbackType::MSPROF_REPORTER_UNINIT), @@ -769,6 +810,10 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::PluginUn FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::CallMsprofReport( ReporterData &reporter_data) const { + if (prof_cb_.msprofReporterCallback == nullptr) { + GELOGE(ge::PARAM_INVALID, "MsprofReporterCallback callback is nullptr."); + return ge::PARAM_INVALID; + } return prof_cb_.msprofReporterCallback( static_cast(MsprofReporterModuleId::MSPROF_MODULE_FRAMEWORK), static_cast(MsprofReporterCallbackType::MSPROF_REPORTER_REPORT), @@ -806,8 +851,9 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::GetFpBpP } } catch (...) { GELOGE(FAILED, "Json prof options is invalid."); - return ge::PARAM_INVALID; + return; } + return; } diff --git a/ge/common/profiling/profiling_manager.h b/ge/common/profiling/profiling_manager.h index c9ceed92..5fa4fac4 100755 --- a/ge/common/profiling/profiling_manager.h +++ b/ge/common/profiling/profiling_manager.h @@ -35,6 +35,23 @@ using Json = nlohmann::json; namespace { const std::string GE_PROFILING_MODULE = "Framework"; + // DataTypeConfig MASK + #define PROF_ACL_API_MASK 0x0001 + #define PROF_TASK_TIME_MASK 0x0002 + #define PROF_AICORE_METRICS_MASK 0x0004 + #define PROF_AICPU_TRACE_MASK 0x0008 + #define PROF_MODEL_EXECUTE_MASK 0x0010 + #define PROF_RUNTIME_API_MASK 0x0020 + #define PROF_RUNTIME_TRACE_MASK 0x0040 + #define PROF_SCHEDULE_TIMELINE_MASK 0x0080 + #define PROF_SCHEDULE_TRACE_MASK 0x0100 + #define PROF_AIVECTORCORE_METRICS_MASK 0x0200 + #define PROF_SUBTASK_TIME_MASK 0x0400 + #define PROF_TRAINING_TRACE_MASK 0x0800 + #define PROF_HCCL_TRACE_MASK 0x1000 + #define PROF_DATA_PROCESS_MASK 0x2000 + #define PROF_MODEL_LOAD_MASK 0x8000000000000000 + } // namespace namespace ge { struct DeviceSubsInfo {