Browse Source

Profiling AR version 3

tags/v1.2.0
taoxiangdong 3 years ago
parent
commit
612c82f405
3 changed files with 119 additions and 28 deletions
  1. +51
    -23
      ge/common/profiling/ge_profiling.cc
  2. +51
    -5
      ge/common/profiling/profiling_manager.cc
  3. +17
    -0
      ge/common/profiling/profiling_manager.h

+ 51
- 23
ge/common/profiling/ge_profiling.cc View File

@@ -22,6 +22,7 @@
#include "graph/load/graph_loader.h"
#include "init/gelib.h"
#include "framework/common/ge_inner_error_codes.h"
#include "opskernel_manager/ops_kernel_builder_manager.h"

namespace {
const uint32_t kDeviceListIndex = 3;
@@ -44,7 +45,7 @@ const std::map<ProfCommandHandleType, std::string> kProfCommandTypeMap = {
{kProfCommandhandleModelUnsubscribe, kProfModelUnsubscribe}};
} // namespace

bool TransProfConfigToParam(const MsprofCommandHandle &profCommand, vector<string> &prof_config_params) {
bool TransProfConfigToParam(const ProfCommandHandleData &profCommand, vector<string> &prof_config_params) {
prof_config_params.clear();
prof_config_params.emplace_back(kDeviceNums);
prof_config_params.emplace_back(std::to_string(profCommand.devNums));
@@ -71,7 +72,7 @@ bool isProfConfigValid(const uint32_t *deviceid_list, uint32_t device_nums) {
return false;
}
if (device_nums == 0 || device_nums > MAX_DEV_NUM) {
GELOGE(ge::PARAM_INVALID, "The device nums is invalid.");
GELOGE(ge::PARAM_INVALID, "The device nums: %u is invalid.", device_nums);
return false;
}

@@ -105,16 +106,26 @@ bool isProfConfigValid(const uint32_t *deviceid_list, uint32_t device_nums) {
}

/// Stores the Msprof control callback in the ProfilingManager singleton.
///
/// @param func  Control callback to register; must not be nullptr.
/// @return ge::PARAM_INVALID when func is nullptr, otherwise ge::SUCCESS
///         (including the case where a callback was already registered and
///         the new one is ignored).
ge::Status RegProfCtrlCallback(MsprofCtrlCallback func) {
  if (func == nullptr) {
    GELOGE(ge::PARAM_INVALID, "Msprof ctrl callback is nullptr.");
    return ge::PARAM_INVALID;
  }

  // The first registration wins; later calls only log a warning.
  const bool already_registered =
      ge::ProfilingManager::Instance().GetMsprofCallback().msprofCtrlCallback != nullptr;
  if (already_registered) {
    GELOGW("Msprof ctrl callback is exist, just ignore it.");
    return ge::SUCCESS;
  }

  GELOGI("GE register Msprof ctrl callback.");
  ge::ProfilingManager::Instance().SetMsprofCtrlCallback(func);
  return ge::SUCCESS;
}

ge::Status RegProfSetDeviceCallback(MsprofSetDeviceCallback func) {
if (func == nullptr) {
GELOGE(ge::PARAM_INVALID, "MsprofSetDeviceCallback callback is nullptr.");
return ge::PARAM_INVALID;
}
// Pass MsprofSetDeviceCallback to runtime
GELOGI("GE pass setdevice callback to runtime.");
ge::Status rt_ret = rtRegDeviceStateCallback(kRtSetDeviceRegName.c_str(), static_cast<rtDeviceStateCallback>(func));
if (rt_ret != ge::SUCCESS) {
GELOGE(rt_ret, "Pass MsprofSetDeviceCallback to runtime failed!");
@@ -124,54 +135,71 @@ ge::Status RegProfSetDeviceCallback(MsprofSetDeviceCallback func) {
}

/// Registers the Msprof reporter callback with GE and forwards it to the
/// runtime and to the ops-kernel builder manager.
///
/// @param func  Reporter callback to register; must not be nullptr.
/// @return ge::PARAM_INVALID when func is nullptr; the failing status when
///         forwarding to the runtime or the ops-kernel builder manager fails;
///         otherwise ge::SUCCESS (including the case where a reporter callback
///         was already registered and the new one is ignored).
ge::Status RegProfReporterCallback(MsprofReporterCallback func) {
  if (func == nullptr) {
    GELOGE(ge::PARAM_INVALID, "MsprofReporterCallback callback is nullptr.");
    return ge::PARAM_INVALID;
  }

  // Test the reporter slot, not the ctrl slot: checking msprofCtrlCallback here
  // would skip reporter registration whenever a ctrl callback had already been
  // registered, leaving the reporter callback unset.
  if (ge::ProfilingManager::Instance().GetMsprofCallback().msprofReporterCallback != nullptr) {
    GELOGW("Msprof reporter callback is exist, just ignore it.");
    return ge::SUCCESS;
  }

  GELOGI("GE register Msprof reporter callback.");
  ge::ProfilingManager::Instance().SetMsprofReporterCallback(func);

  // Pass MsprofReporterCallback to runtime
  ge::Status rt_ret = rtSetMsprofReporterCallback(func);
  if (rt_ret != ge::SUCCESS) {
    GELOGE(rt_ret, "Pass MsprofReporterCallback to runtime failed!!");
    return rt_ret;
  }

  // Pass MsprofReporterCallback to hccl in opskernel so initialize
  rt_ret = OpskernelBuilderManager::Instance().RegProfReporterCallback(func);
  if (rt_ret != ge::SUCCESS) {
    GELOGE(rt_ret, "Pass MsprofReporterCallback to hccl failed.");
    return rt_ret;
  }

  return ge::SUCCESS;
}

/// Translates a profiling command into a ge::Command and executes it through
/// ge::GraphLoader::CommandHandle.
///
/// @param type  Command kind; must be a key of kProfCommandTypeMap.
/// @param data  Pointer to a MsprofCommandHandleData. May be nullptr only when
///              type is kProfCommandhandleFinalize.
/// @param len   Not read by this function.
/// @return ge::SUCCESS when the command was executed;
///         ge::PARAM_INVALID when type is not in kProfCommandTypeMap or the
///         configuration cannot be converted to string parameters;
///         ge::FAILED when the device configuration is invalid or the command
///         handler reports an error.
ge::Status ProfCommandHandle(ProfCommandHandleType type, void *data, uint32_t len) {
  // The null check is skipped for finalize; on that path prof_config_param is
  // never dereferenced below.
  // NOTE(review): GE_CHECK_NOTNULL presumably returns an error status on nullptr - confirm.
  if (type != kProfCommandhandleFinalize) {
    GE_CHECK_NOTNULL(data);
  }
  MsprofCommandHandleData *prof_config_param = static_cast<MsprofCommandHandleData *>(data);

  auto iter = kProfCommandTypeMap.find(type);
  if (iter == kProfCommandTypeMap.end()) {
    GELOGW("The prof comand type is invalid.");
    return ge::PARAM_INVALID;
  }

  // Only start/stop commands carry a device list, so only they are validated
  // and converted; for every other type prof_params stays empty.
  const bool has_device_list = (type == kProfCommandhandleStart || type == kProfCommandhandleStop);
  std::vector<string> prof_params;
  if (has_device_list) {
    if (!isProfConfigValid(prof_config_param->devIdList, prof_config_param->devNums)) {
      return ge::FAILED;
    }
    if (!TransProfConfigToParam(*prof_config_param, prof_params)) {
      GELOGE(ge::PARAM_INVALID, "Transfer profilerConfig to string vector failed");
      return ge::PARAM_INVALID;
    }
  }

  ge::GraphLoader graph_loader;
  ge::Command command;
  command.cmd_params.clear();
  command.cmd_type = iter->second;
  command.cmd_params = prof_params;
  // NOTE(review): for finalize, module_index keeps whatever ge::Command
  // initialises it to - confirm that the struct gives it a defined default.
  if (type != kProfCommandhandleFinalize) {
    command.module_index = prof_config_param->profSwitch;
  }
  GELOGI("GE commandhandle execute, Command Type: %d, data type config: 0x%llx", type, command.module_index);
  // prof_params is indexed only on the path that filled it above.
  if (has_device_list) {
    GELOGI("Profiling device nums:%s , deviceID:[%s]", prof_params[0].c_str(), prof_params[kDeviceListIndex].c_str());
  }

  ge::Status ret = graph_loader.CommandHandle(command);
  if (ret != ge::SUCCESS) {
    GELOGE(ret, "Handle profiling command failed");
    return ge::FAILED;
  }

  GELOGI("Successfully execute profiling command type: %d, command 0x%llx.", type, command.module_index);
  return ge::SUCCESS;
}


+ 51
- 5
ge/common/profiling/profiling_manager.cc View File

@@ -22,6 +22,7 @@
#include "graph/ge_context.h"
#include "runtime/base.h"
#include "graph/load/new_model_manager/davinci_model.h"
#include "opskernel_manager/ops_kernel_builder_manager.h"

namespace {
const char *const kTrainingTrace = "training_trace";
@@ -41,7 +42,10 @@ namespace ge {
// Constructs the manager with load, execute and training-trace profiling
// flags cleared and a subscribe count of zero.
ProfilingManager::ProfilingManager()
    : is_load_profiling_(false),
      is_execute_profiling_(false),
      is_training_trace_(false),
      subscribe_count_(0) {
  // Start with no Msprof callbacks registered; callers compare these pointers
  // against nullptr before invoking them.
  prof_cb_.msprofCtrlCallback = nullptr;
  prof_cb_.msprofReporterCallback = nullptr;
}

ProfilingManager::~ProfilingManager() {}

@@ -64,6 +68,10 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status ProfilingManager::In
}

if (is_execute_profiling_) {
if (prof_cb_.msprofCtrlCallback == nullptr) {
GELOGE(ge::PARAM_INVALID, "MsprofCtrlCallback callback is nullptr.");
return ge::PARAM_INVALID;
}
int32_t cb_ret = prof_cb_.msprofCtrlCallback(
static_cast<uint32_t>(MsprofCtrlCallbackType::MSPROF_CTRL_INIT_GE_OPTIONS),
static_cast<void *>(&prof_conf), sizeof(MsprofGeOptions));
@@ -116,7 +124,7 @@ ge::Status ProfilingManager::InitFromOptions(const Options &options, MsprofGeOpt
// Parse json str for bp fp
Status ret = ParseOptions(prof_conf.options);
if (ret != ge::SUCCESS) {
GELOGE(ge::PARAM_INVALID, "Parse taining trace param failed.");
GELOGE(ge::PARAM_INVALID, "Parse training trace param failed.");
return ge::PARAM_INVALID;
}

@@ -182,6 +190,10 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::StopProf
}
// stop profiling
if (prof_cb_.msprofCtrlCallback == nullptr) {
GELOGE(ge::PARAM_INVALID, "MsprofCtrlCallback callback is nullptr.");
return ge::PARAM_INVALID;
}
int32_t cb_ret = prof_cb_.msprofCtrlCallback(static_cast<uint32_t>(MsprofCtrlCallbackType::MSPROF_CTRL_FINALIZE),
nullptr, 0);
if (cb_ret != 0) {
@@ -210,7 +222,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::Profilin
.append(std::to_string(stream_id)).append(" ")
.append(std::to_string(model_id)).append("\n"));

ReporterData reporter_data;
ReporterData reporter_data{};
reporter_data.deviceId = device_id;
reporter_data.data = (unsigned char *)data.c_str();
reporter_data.dataLen = data.size();
@@ -298,7 +310,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::Profilin

void ProfilingManager::GraphDescReport(const int32_t &device_id, const string &data) {
#ifdef DAVINCI_SUPPORT_PROFILING
ReporterData reporter_data;
ReporterData reporter_data{};
int ret = -1;
int32_t cb_ret = -1;
size_t index = data.size() / kReportMaxLen;
@@ -487,6 +499,11 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::ProfIn
GELOGE(FAILED, "Runtime profiler start failed.");
return FAILED;
}
Status hccl_ret = OpskernelBuilderManager::Instance().ProfStart(model_load_mask);
if (hccl_ret != SUCCESS) {
GELOGE(FAILED, "Hccl profiler start failed.");
return FAILED;
}
is_load_profiling_ = true;
GELOGI("Prof init: model load profiling on.");
}
@@ -517,6 +534,11 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::ProfFi
return FAILED;
}

Status hccl_ret = OpskernelBuilderManager::Instance().ProfStop(PROF_MODEL_LOAD_MASK);
if (hccl_ret != SUCCESS) {
GELOGE(FAILED, "Hccl profiler stop failed.");
return FAILED;
}
for (auto device_id_module : device_id_module_map_) {
if (device_id_module.second != 0) {
uint32_t device_id = static_cast<uint32_t>(device_id_module.first);
@@ -640,6 +662,12 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::ProfSt
GELOGE(FAILED, "Runtime profiler config proc failed.");
return FAILED;
}
Status hccl_ret = OpskernelBuilderManager::Instance().ProfStart(module);
if (hccl_ret != SUCCESS) {
GELOGE(FAILED, "Hccl profiler start failed.");
return FAILED;
}
if ((module & PROF_MODEL_EXECUTE_MASK) == PROF_MODEL_EXECUTE_MASK) {
for (int32_t i = 0; i < device_num; i++) {
if (std::find(device_id_.begin(), device_id_.end(), device_list[i]) == device_id_.end()) {
@@ -681,6 +709,11 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::ProfSt
GELOGE(FAILED, "Prof stop: runtime profiler config proc failed.");
return FAILED;
}
Status hccl_ret = OpskernelBuilderManager::Instance().ProfStop(module);
if (hccl_ret != SUCCESS) {
GELOGE(FAILED, "Hccl profiler stop failed.");
return FAILED;
}
uint64_t execute_model_mask = module & PROF_MODEL_EXECUTE_MASK;
if (execute_model_mask == PROF_MODEL_EXECUTE_MASK) {
for (int32_t i = 0; i < device_num; i++) {
@@ -749,6 +782,10 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool ProfilingManager::Profilin
}

FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::PluginInit() const {
if (prof_cb_.msprofReporterCallback == nullptr) {
GELOGE(ge::PARAM_INVALID, "MsprofReporterCallback callback is nullptr.");
return ge::PARAM_INVALID;
}
return prof_cb_.msprofReporterCallback(
static_cast<uint32_t>(MsprofReporterModuleId::MSPROF_MODULE_FRAMEWORK),
static_cast<uint32_t>(MsprofReporterCallbackType::MSPROF_REPORTER_INIT),
@@ -757,6 +794,10 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::Plugin

FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::PluginUnInit() const {
#ifdef DAVINCI_SUPPORT_PROFILING
if (prof_cb_.msprofReporterCallback == nullptr) {
GELOGE(ge::PARAM_INVALID, "MsprofReporterCallback callback is nullptr.");
return ge::PARAM_INVALID;
}
int32_t cb_ret = prof_cb_.msprofReporterCallback(
static_cast<uint32_t>(MsprofReporterModuleId::MSPROF_MODULE_FRAMEWORK),
static_cast<uint32_t>(MsprofReporterCallbackType::MSPROF_REPORTER_UNINIT),
@@ -769,6 +810,10 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::PluginUn

FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::CallMsprofReport(
ReporterData &reporter_data) const {
if (prof_cb_.msprofReporterCallback == nullptr) {
GELOGE(ge::PARAM_INVALID, "MsprofReporterCallback callback is nullptr.");
return ge::PARAM_INVALID;
}
return prof_cb_.msprofReporterCallback(
static_cast<uint32_t>(MsprofReporterModuleId::MSPROF_MODULE_FRAMEWORK),
static_cast<uint32_t>(MsprofReporterCallbackType::MSPROF_REPORTER_REPORT),
@@ -806,8 +851,9 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::GetFpBpP
}
} catch (...) {
GELOGE(FAILED, "Json prof options is invalid.");
return ge::PARAM_INVALID;
return;
}
return;
}




+ 17
- 0
ge/common/profiling/profiling_manager.h View File

@@ -35,6 +35,23 @@ using Json = nlohmann::json;

// File-local constants for the profiling manager.
// NOTE(review): this unnamed namespace lives in a header, so every translation
// unit that includes it gets its own copy of GE_PROFILING_MODULE. The #define
// macros below are handled by the preprocessor and are not scoped by the
// namespace.
namespace {
const std::string GE_PROFILING_MODULE = "Framework";
// DataTypeConfig bit masks. Each macro below has exactly one bit set, so the
// masks can be OR-ed together and tested with bitwise AND.
// 0x0001 .. 0x2000 occupy bits 0-13; 0x4000 is not defined here.
#define PROF_ACL_API_MASK 0x0001
#define PROF_TASK_TIME_MASK 0x0002
#define PROF_AICORE_METRICS_MASK 0x0004
#define PROF_AICPU_TRACE_MASK 0x0008
#define PROF_MODEL_EXECUTE_MASK 0x0010
#define PROF_RUNTIME_API_MASK 0x0020
#define PROF_RUNTIME_TRACE_MASK 0x0040
#define PROF_SCHEDULE_TIMELINE_MASK 0x0080
#define PROF_SCHEDULE_TRACE_MASK 0x0100
#define PROF_AIVECTORCORE_METRICS_MASK 0x0200
#define PROF_SUBTASK_TIME_MASK 0x0400
#define PROF_TRAINING_TRACE_MASK 0x0800
#define PROF_HCCL_TRACE_MASK 0x1000
#define PROF_DATA_PROCESS_MASK 0x2000
// Bit 63 (the most significant bit of a 64-bit value); needs a 64-bit operand.
#define PROF_MODEL_LOAD_MASK 0x8000000000000000

} // namespace
namespace ge {
struct DeviceSubsInfo {


Loading…
Cancel
Save