From: @zhengyuanhua Reviewed-by: @xchu42,@ji_chen Signed-off-by: @ji_chen tags/v1.2.0
@@ -218,6 +218,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::Profilin | |||
uint32_t stream_id = task.stream_id; | |||
std::string shape_type = task.shape_type; | |||
int64_t cur_iter_num = task.cur_iter_num; | |||
uint32_t task_type = task.task_type; | |||
data = model_name.append(" ") | |||
.append(op_name).append(" ") | |||
.append(std::to_string(block_dim)).append(" ") | |||
@@ -225,7 +226,8 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::Profilin | |||
.append(std::to_string(stream_id)).append(" ") | |||
.append(std::to_string(model_id)).append(" ") | |||
.append(shape_type).append(" ") | |||
.append(std::to_string(cur_iter_num)).append("\n"); | |||
.append(std::to_string(cur_iter_num)).append(" ") | |||
.append(std::to_string(task_type)).append("\n"); | |||
ReporterData reporter_data{}; | |||
reporter_data.deviceId = device_id; | |||
@@ -3065,6 +3065,65 @@ Status DavinciModel::MallocKnownArgs() { | |||
return SUCCESS; | |||
} | |||
void DavinciModel::SaveProfilingTaskDescInfo(const OpDescPtr &op, const TaskInfoPtr &task, | |||
const domi::TaskDef &task_def, size_t task_index) { | |||
task_desc_info_.clear(); | |||
bool flag = GetL1FusionEnableOption(); | |||
char skt_enable_env[MMPA_MAX_PATH] = { 0x00 }; | |||
INT32 res = mmGetEnv("SKT_ENABLE", skt_enable_env, MMPA_MAX_PATH); | |||
int64_t env_flag = (res == EN_OK) ? std::strtol(skt_enable_env, nullptr, kDecimal) : 0; | |||
if (env_flag != 0) { | |||
flag = true; | |||
} | |||
TaskDescInfo task_desc_info; | |||
if (!om_name_.empty()) { | |||
task_desc_info.model_name = om_name_; | |||
} else { | |||
task_desc_info.model_name = name_; | |||
} | |||
task_desc_info.op_name = op->GetName(); | |||
task_desc_info.block_dim = task_def.kernel().block_dim(); | |||
task_desc_info.task_id = task->GetTaskID(); | |||
task_desc_info.stream_id = task->GetStreamId(); | |||
task_desc_info.shape_type = "static"; | |||
task_desc_info.cur_iter_num = 0; | |||
// task type | |||
task_desc_info.task_type = kTaskTypeInvalid; | |||
auto model_task_type = static_cast<rtModelTaskType_t>(task_def.type()); | |||
if (model_task_type == RT_MODEL_TASK_KERNEL) { | |||
const domi::KernelDef &kernel_def = task_def.kernel(); | |||
const auto &context = kernel_def.context(); | |||
auto kernel_type = static_cast<ccKernelType>(context.kernel_type()); | |||
if (kernel_type == ccKernelType::TE) { | |||
task_desc_info.task_type = kTaskTypeAicore; | |||
} else if (kernel_type == ccKernelType::AI_CPU || kernel_type == ccKernelType::CUST_AI_CPU) { | |||
task_desc_info.task_type = kTaskTypeAicpu; | |||
} else { | |||
GELOGD("Other kernel type: %u", context.kernel_type()); | |||
} | |||
} else if (model_task_type == RT_MODEL_TASK_KERNEL_EX) { | |||
task_desc_info.task_type = kTaskTypeAicpu; | |||
} else { | |||
GELOGD("Skip task type: %d", static_cast<int>(model_task_type)); | |||
} | |||
profiler_report_op_info_[task_desc_info.op_name] = | |||
std::pair<uint32_t, uint32_t>(task_desc_info.task_id, task_desc_info.stream_id); | |||
task_desc_info_.emplace_back(task_desc_info); | |||
if (flag) { | |||
if (task->GetSktTaskID() != 0xFFFFFFFF) { | |||
TaskDescInfo task_desc_info; | |||
string op_name = "super_kernel_" + to_string(task_index); | |||
task_desc_info.op_name = op_name; | |||
task_desc_info.task_id = task->GetSktTaskID(); | |||
profiler_report_op_info_[task_desc_info.op_name] = | |||
std::pair<uint32_t, uint32_t>(task_desc_info.task_id, task_desc_info.stream_id); | |||
task_desc_info_.emplace_back(task_desc_info); | |||
} | |||
} | |||
return; | |||
} | |||
Status DavinciModel::DistributeTask() { | |||
GELOGI("do Distribute."); | |||
for (auto &task : cpu_task_list_) { | |||
@@ -3075,19 +3134,11 @@ Status DavinciModel::DistributeTask() { | |||
GE_CHK_STATUS_RET(task->Distribute()); | |||
} | |||
task_desc_info_.clear(); | |||
bool flag = GetL1FusionEnableOption(); | |||
char skt_enable_env[MMPA_MAX_PATH] = { 0x00 }; | |||
INT32 res = mmGetEnv("SKT_ENABLE", skt_enable_env, MMPA_MAX_PATH); | |||
int64_t env_flag = (res == EN_OK) ? std::strtol(skt_enable_env, nullptr, kDecimal) : 0; | |||
if (env_flag != 0) { | |||
flag = true; | |||
} | |||
const auto &model_task_def = ge_model_->GetModelTaskDefPtr(); | |||
for (size_t task_index = 0; task_index < task_list_.size(); ++task_index) { | |||
auto &task_def = model_task_def->task(task_index); | |||
auto &task = task_list_.at(task_index); | |||
GE_CHECK_NOTNULL(task); | |||
GE_CHK_STATUS_RET(task->Distribute(), "Task[%zu] distribute fail", task_index); | |||
// for data dump | |||
auto op_index = std::max(task_def.kernel().context().op_index(), | |||
@@ -3107,33 +3158,9 @@ Status DavinciModel::DistributeTask() { | |||
GE_IF_BOOL_EXEC(no_need_profiling, continue); | |||
SaveDumpOpInfo(runtime_param_, op, task->GetTaskID(), task->GetStreamId()); | |||
// Load task info for profiling | |||
TaskDescInfo task_desc_info; | |||
if (!om_name_.empty()) { | |||
task_desc_info.model_name = om_name_; | |||
} else { | |||
task_desc_info.model_name = name_; | |||
} | |||
task_desc_info.op_name = op->GetName(); | |||
task_desc_info.block_dim = task_def.kernel().block_dim(); | |||
task_desc_info.task_id = task->GetTaskID(); | |||
task_desc_info.stream_id = task->GetStreamId(); | |||
task_desc_info.shape_type = "static"; | |||
task_desc_info.cur_iter_num = 0; | |||
profiler_report_op_info_[task_desc_info.op_name] = | |||
std::pair<uint32_t, uint32_t>(task_desc_info.task_id, task_desc_info.stream_id); | |||
task_desc_info_.emplace_back(task_desc_info); | |||
if (flag) { | |||
if (task->GetSktTaskID() != 0xFFFFFFFF) { | |||
TaskDescInfo task_desc_info; | |||
string op_name = "super_kernel_" + to_string(task_index); | |||
task_desc_info.op_name = op_name; | |||
task_desc_info.task_id = task->GetSktTaskID(); | |||
profiler_report_op_info_[task_desc_info.op_name] = | |||
std::pair<uint32_t, uint32_t>(task_desc_info.task_id, task_desc_info.stream_id); | |||
task_desc_info_.emplace_back(task_desc_info); | |||
} | |||
} | |||
// save task info for profiling | |||
SaveProfilingTaskDescInfo(op, task, task_def, task_index); | |||
} | |||
// launch dump kernel to aicpu | |||
GE_CHK_STATUS_RET(data_dumper_.LoadDumpInfo(), "Load dump info failed."); | |||
@@ -623,6 +623,9 @@ class DavinciModel { | |||
Status DistributeTask(); | |||
// Records profiling info (model/op name, block dim, task id, stream id, task type) of one
// task into task_desc_info_ and profiler_report_op_info_; called per task by DistributeTask().
void SaveProfilingTaskDescInfo(const OpDescPtr &op, const TaskInfoPtr &task, | |||
const domi::TaskDef &task_def, size_t task_index); | |||
uint8_t *MallocFeatureMapMem(size_t data_size); | |||
uint8_t *MallocWeightsMem(size_t weights_size); | |||
@@ -159,27 +159,9 @@ Status NodeDoneCallback::GetTaskDescInfo(const NodePtr node, const HybridModel * | |||
} | |||
GELOGD("GetTaskDescInfo of node [%s] start.", node->GetName().c_str()); | |||
auto op_desc = node->GetOpDesc(); | |||
std::string op_name = op_desc->GetName(); | |||
std::string dynamic_model_name = model->GetModelName(); | |||
uint32_t task_id = context_->GetTaskId(); | |||
uint32_t stream_id = context_->GetStreamId(); | |||
TaskDescInfo tmp_task_desc_info; | |||
tmp_task_desc_info.model_name = dynamic_model_name; | |||
tmp_task_desc_info.op_name = op_name; | |||
tmp_task_desc_info.block_dim = 0; | |||
auto task_defs = model->GetTaskDefs(node); | |||
if (task_defs != nullptr && (*task_defs).size() > 0) { | |||
const auto &task_def = (*task_defs)[0]; | |||
tmp_task_desc_info.block_dim = task_def.kernel().block_dim(); | |||
} | |||
tmp_task_desc_info.task_id = task_id; | |||
tmp_task_desc_info.stream_id = stream_id; | |||
tmp_task_desc_info.shape_type = "dynamic"; | |||
tmp_task_desc_info.cur_iter_num = graph_context_->iteration; | |||
GELOGD("GetTaskDescInfo of node [%s] end, task_id[%u], stream_id[%u]", | |||
node->GetName().c_str(), task_id, stream_id); | |||
task_desc_info.emplace_back(tmp_task_desc_info); | |||
task_desc_info = context_->GetProfilingTaskDescInfo(); | |||
context_->ClearProfilingTaskDescInfo(); | |||
return SUCCESS; | |||
} | |||
@@ -247,7 +229,6 @@ Status NodeDoneCallback::ProfilingReport() { | |||
GELOGD("ProfilingReport of node [%s] model [%s] start.", node->GetName().c_str(), model->GetModelName().c_str()); | |||
std::vector<TaskDescInfo> task_desc_info; | |||
TaskDescInfo tmp_task_desc_info; | |||
auto profiling_ret = GetTaskDescInfo(node, model, task_desc_info); | |||
if (profiling_ret != RT_ERROR_NONE) { | |||
GELOGE(profiling_ret, "Get task info of node[%s] failed.", node->GetName().c_str()); | |||
@@ -182,16 +182,8 @@ Status AiCoreNodeTask::ExecuteAsync(TaskContext &context, std::function<void()> | |||
} | |||
RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[AiCoreNodeLaunchKernel] Start"); | |||
GE_CHK_STATUS_RET_NOLOG((*it)->LaunchKernel(context.GetStream())); | |||
uint32_t task_id = 0; | |||
uint32_t stream_id = 0; | |||
rtError_t rt_ret = rtGetTaskIdAndStreamID(&task_id, &stream_id); | |||
if (rt_ret != RT_ERROR_NONE) { | |||
GELOGE(rt_ret, "Get task_id and stream_id failed."); | |||
return rt_ret; | |||
} | |||
context.SetTaskId(task_id); | |||
context.SetStreamId(stream_id); | |||
GELOGD("AiCore node[%s] task_id: %u, stream_id: %u.", context.GetNodeName(), task_id, stream_id); | |||
// save profiling data | |||
(void)context.SaveProfilingTaskDescInfo(kTaskTypeAicore, (*it)->GetBlockDim()); | |||
RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[AiCoreNodeLaunchKernel] End"); | |||
RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[AiCoreNodeLaunchKernel] End"); | |||
} | |||
@@ -48,6 +48,8 @@ class AiCoreOpTask { | |||
// Returns the value of the clear_atomic_ member.
bool GetClearAtomic() const {
  return clear_atomic_;
}
// Returns the value of the block_dim_ member.
uint32_t GetBlockDim() const {
  return block_dim_;
}
protected: | |||
Status UpdateTilingInfo(TaskContext &context); | |||
virtual std::string GetKeyForOpParamSize() const; | |||
@@ -190,16 +190,8 @@ Status AicpuNodeTaskBase::ExecuteAsync(TaskContext &context, std::function<void( | |||
HYBRID_CHK_STATUS_RET(LaunchTask(context), "[%s] Failed to launch task", node_name_.c_str()); | |||
uint32_t task_id = 0; | |||
uint32_t stream_id = 0; | |||
rtError_t rt_ret = rtGetTaskIdAndStreamID(&task_id, &stream_id); | |||
if (rt_ret != RT_ERROR_NONE) { | |||
GELOGE(rt_ret, "Get task_id and stream_id failed."); | |||
return rt_ret; | |||
} | |||
context.SetTaskId(task_id); | |||
context.SetStreamId(stream_id); | |||
GELOGD("AiCpu node[%s] task_id: %u, stream_id: %u.", context.GetNodeName(), task_id, stream_id); | |||
// save profiling data | |||
(void)context.SaveProfilingTaskDescInfo(kTaskTypeAicpu, 0); | |||
auto callback = [=, &context]() { | |||
GELOGD("Node[%s] callback start.", node_name_.c_str()); | |||
@@ -21,6 +21,7 @@ | |||
#include "graph/debug/ge_attr_define.h" | |||
#include "hybrid/executor/hybrid_execution_context.h" | |||
#include "hybrid/executor/subgraph_executor.h" | |||
#include "common/profiling/profiling_manager.h" | |||
namespace ge { | |||
namespace hybrid { | |||
@@ -498,5 +499,42 @@ bool TaskContext::NeedCallback() { | |||
Status TaskContext::Synchronize() { | |||
return execution_context_->Synchronize(GetStream()); | |||
} | |||
// Appends a profiling descriptor for the kernel launched through this context.
//
// Does nothing and returns SUCCESS when model-execute profiling is off. Otherwise the
// task id and stream id are queried from the runtime and stored, together with the model
// name, op name, `block_dim`, `task_type`, the "dynamic" shape type and the current
// iteration, as a new element of task_desc_info.
//
// Returns the runtime error code if the task/stream id query fails; a null op desc,
// execution context or model is handled by GE_CHECK_NOTNULL.
Status TaskContext::SaveProfilingTaskDescInfo(uint32_t task_type, uint32_t block_dim) {
  // Nothing to record unless model-execute profiling is switched on.
  if (!ProfilingManager::Instance().ProfilingModelExecuteOn()) {
    return SUCCESS;
  }

  const NodeItem &item = GetNodeItem();
  auto op_desc = item.GetOpDesc();
  GE_CHECK_NOTNULL(op_desc);

  // must be called after Launch kernel
  uint32_t launched_task_id = 0;
  uint32_t launched_stream_id = 0;
  rtError_t query_ret = rtGetTaskIdAndStreamID(&launched_task_id, &launched_stream_id);
  if (query_ret != RT_ERROR_NONE) {
    GELOGE(query_ret, "Get task_id and stream_id failed.");
    return query_ret;
  }
  GELOGD("Node[%s] task_id: %u, stream_id: %u.", GetNodeName(), launched_task_id, launched_stream_id);

  const GraphExecutionContext *graph_context = GetExecutionContext();
  GE_CHECK_NOTNULL(graph_context);
  const HybridModel *model = graph_context->model;
  GE_CHECK_NOTNULL(model);

  TaskDescInfo info;
  info.model_name = model->GetModelName();
  info.op_name = op_desc->GetName();
  info.block_dim = block_dim;
  info.task_type = task_type;
  info.task_id = launched_task_id;
  info.stream_id = launched_stream_id;
  info.shape_type = "dynamic";
  info.cur_iter_num = iteration_;
  task_desc_info.emplace_back(info);
  return SUCCESS;
}
} // namespace hybrid | |||
} // namespace ge |
@@ -22,6 +22,7 @@ | |||
#include <vector> | |||
#include "common/properties_manager.h" | |||
#include "external/ge/ge_api_error_codes.h" | |||
#include "framework/common/ge_types.h" | |||
#include "hybrid/common/tensor_value.h" | |||
#include "hybrid/common/npu_memory_allocator.h" | |||
#include "hybrid/executor/rt_callback_manager.h" | |||
@@ -108,6 +109,10 @@ class TaskContext { | |||
void SetForceInferShape(bool force_infer_shape); | |||
void *handle_ = nullptr; | |||
// Read-only view of the profiling descriptors stored in task_desc_info.
const std::vector<TaskDescInfo>& GetProfilingTaskDescInfo() const {
  return task_desc_info;
}
// Appends a profiling descriptor carrying the given task type and block dim to
// task_desc_info when model-execute profiling is on; otherwise returns SUCCESS untouched.
Status SaveProfilingTaskDescInfo(uint32_t task_type, uint32_t block_dim); | |||
// Drops every profiling descriptor held in task_desc_info.
void ClearProfilingTaskDescInfo() {
  task_desc_info.clear();
}
private: | |||
TaskContext(GraphExecutionContext *execution_context, | |||
const NodeItem *node_item, | |||
@@ -127,6 +132,7 @@ class TaskContext { | |||
uint64_t iteration_ = 0; | |||
uint32_t task_id_ = 0; | |||
uint32_t stream_id_ = 0; | |||
std::vector<TaskDescInfo> task_desc_info; | |||
}; | |||
} // namespace hybrid | |||
} // namespace ge | |||
@@ -70,6 +70,7 @@ Status ProfilingTaskInfo(OpTask *op_task, const string &shape_type) { | |||
tmp_task_desc_info.stream_id = stream_id; | |||
tmp_task_desc_info.shape_type = shape_type; | |||
tmp_task_desc_info.cur_iter_num = 0; | |||
tmp_task_desc_info.task_type = op_task->GetTaskType(); | |||
GELOGD("GetTaskDescInfo of op [%s] end, task_id[%u], stream_id[%u]", op_name.c_str(), task_id, stream_id); | |||
task_desc_info.emplace_back(tmp_task_desc_info); | |||
@@ -145,6 +145,8 @@ Status OpTask::LaunchKernel(const vector<GeTensorDesc> &input_desc, | |||
return UNSUPPORTED; | |||
} | |||
// Default for the virtual GetTaskType(): reports kTaskTypeInvalid.
uint32_t OpTask::GetTaskType() const {
  return kTaskTypeInvalid;
}
TbeOpTask::~TbeOpTask() { | |||
if (sm_desc_ != nullptr) { | |||
(void)rtMemFreeManaged(sm_desc_); | |||
@@ -161,6 +163,8 @@ size_t TbeOpTask::GetArgSize() const { return arg_size_; } | |||
const std::string &TbeOpTask::GetStubName() const { return stub_name_; } | |||
// TbeOpTask reports its task type as kTaskTypeAicore.
uint32_t TbeOpTask::GetTaskType() const {
  return kTaskTypeAicore;
}
Status TbeOpTask::LaunchKernel(rtStream_t stream) { | |||
GELOGD("To invoke rtKernelLaunch. task = %s, block_dim = %u", this->stub_name_.c_str(), block_dim_); | |||
auto *sm_desc = reinterpret_cast<rtSmDesc_t *>(sm_desc_); | |||
@@ -802,6 +806,8 @@ Status AiCpuBaseTask::UpdateArgTable(const SingleOpModelParam ¶m) { | |||
return DoUpdateArgTable(param, false); | |||
} | |||
// AiCpuBaseTask reports its task type as kTaskTypeAicpu.
uint32_t AiCpuBaseTask::GetTaskType() const {
  return kTaskTypeAicpu;
}
void AiCpuTask::GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) { | |||
arg_base = reinterpret_cast<uintptr_t *>(io_addr_host_.data()); | |||
arg_count = io_addr_host_.size(); | |||
@@ -52,6 +52,7 @@ class OpTask { | |||
std::vector<GeTensorDesc> &output_desc, | |||
std::vector<DataBuffer> &output_buffers, | |||
rtStream_t stream); | |||
// Task type written into the profiling task descriptor; the OpTask implementation
// returns kTaskTypeInvalid and subclasses override it.
virtual uint32_t GetTaskType() const; | |||
protected: | |||
Status DoUpdateArgTable(const SingleOpModelParam ¶m, bool keep_workspace); | |||
@@ -85,6 +86,7 @@ class TbeOpTask : public OpTask { | |||
size_t GetArgSize() const; | |||
const std::string &GetStubName() const; | |||
void EnableDynamicSupport(const NodePtr &node, void *tiling_buffer, size_t max_tiling_size); | |||
// Task type written into the profiling task descriptor; TbeOpTask returns kTaskTypeAicore.
uint32_t GetTaskType() const override; | |||
private: | |||
friend class SingleOpModel; | |||
@@ -113,6 +115,8 @@ class AiCpuBaseTask : public OpTask { | |||
~AiCpuBaseTask() override; | |||
UnknowShapeOpType GetUnknownType() const { return unknown_type_; } | |||
Status UpdateArgTable(const SingleOpModelParam ¶m) override; | |||
// Task type written into the profiling task descriptor; AiCpuBaseTask returns kTaskTypeAicpu.
uint32_t GetTaskType() const override; | |||
protected: | |||
Status UpdateIoAddr(const std::vector<DataBuffer> &inputs, const std::vector<DataBuffer> &outputs); | |||
Status SetInputConst(); | |||
@@ -41,12 +41,7 @@ enum FrameworkType { | |||
}; | |||
const std::map<std::string, std::string> kFwkTypeToStr = { | |||
{"0", "Caffe"}, | |||
{"1", "MindSpore"}, | |||
{"3", "TensorFlow"}, | |||
{"4", "Android_NN"}, | |||
{"5", "Onnx"} | |||
}; | |||
{"0", "Caffe"}, {"1", "MindSpore"}, {"3", "TensorFlow"}, {"4", "Android_NN"}, {"5", "Onnx"}}; | |||
enum OpEngineType { | |||
ENGINE_SYS = 0, // default engine | |||
@@ -61,6 +56,11 @@ enum InputAippType { DATA_WITHOUT_AIPP = 0, DATA_WITH_STATIC_AIPP, DATA_WITH_DYN | |||
const char *const GE_ENGINE_ATTR_MEM_TYPE_HBM = "HBM"; | |||
const char *const GE_OPTION_EXEC_PLACEMENT = "ge.exec.placement"; | |||
// Task type codes stored in TaskDescInfo::task_type for profiling data.
constexpr uint32_t kTaskTypeAicore = 0;
constexpr uint32_t kTaskTypeAicpu = 1;
// Used when a task is neither classified as AI Core nor as AI CPU.
constexpr uint32_t kTaskTypeInvalid = 0xFFFF;
// Data cache, including data address and length | |||
struct DataBuffer { | |||
public: | |||
@@ -256,6 +256,7 @@ struct TaskDescInfo { | |||
uint32_t stream_id; | |||
std::string shape_type; | |||
int64_t cur_iter_num; | |||
uint32_t task_type; | |||
}; | |||
// Profiling info of graph | |||
@@ -1 +1 @@ | |||
Subproject commit dc6cceb67bc82b567bcbd6f415776644253e1467 | |||
Subproject commit b00c50c2a8c2ce06929b27f7b74185a950737ec8 |
@@ -1 +1 @@ | |||
Subproject commit 4e72aae41e78af1a19cd965da4a45cbd988b9a75 | |||
Subproject commit f0109a2c70981d74932bb38bb56722caff3323a5 |