!968 profiling task desc info

From: @zhengyuanhua Reviewed-by: @xchu42,@ji_chen Signed-off-by: @ji_chen
3 years ago · 7bb0eeb614
--- a/ge/common/profiling/profiling_manager.cc
+++ b/ge/common/profiling/profiling_manager.cc
@@ -218,6 +218,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::Profilin
    uint32_t stream_id = task.stream_id;
    std::string shape_type = task.shape_type;
    int64_t cur_iter_num = task.cur_iter_num;
    uint32_t task_type = task.task_type;
    data = model_name.append(" ")
                     .append(op_name).append(" ")
                     .append(std::to_string(block_dim)).append(" ")
@@ -225,7 +226,8 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::Profilin
                     .append(std::to_string(stream_id)).append(" ")
                     .append(std::to_string(model_id)).append(" ")
                     .append(shape_type).append(" ")
                     .append(std::to_string(cur_iter_num)).append("\n");
                     .append(std::to_string(cur_iter_num)).append(" ")
                     .append(std::to_string(task_type)).append("\n");

    ReporterData reporter_data{};
    reporter_data.deviceId = device_id;
--- a/ge/graph/load/new_model_manager/davinci_model.cc
+++ b/ge/graph/load/new_model_manager/davinci_model.cc
@@ -3065,6 +3065,65 @@ Status DavinciModel::MallocKnownArgs() {
  return SUCCESS;
 }

 // Records the profiling task description of one distributed task.
 //
 // Fills a TaskDescInfo from the op name, the kernel block dim, the task/stream ids reported by
 // `task` and a task type derived from `task_def`, then stores it in task_desc_info_ and in
 // profiler_report_op_info_ (keyed by op name). When L1 fusion is enabled or the SKT_ENABLE
 // environment variable parses to a non-zero value, and the task reports a super kernel task id
 // other than 0xFFFFFFFF, a second record named "super_kernel_<task_index>" is stored as well.
 //
 // @param op          op description of the task; dereferenced without a null check
 // @param task        distributed task providing task id, stream id and super kernel task id
 // @param task_def    task definition providing the block dim and the task/kernel type
 // @param task_index  index of the task, used only to name the super kernel record
 void DavinciModel::SaveProfilingTaskDescInfo(const OpDescPtr &op, const TaskInfoPtr &task,
                                              const domi::TaskDef &task_def, size_t task_index) {
  // NOTE(review): this clears the list on every call, and DistributeTask calls this function once
  // per task, so only the records of the last call appear to survive - confirm this is intended.
  task_desc_info_.clear();

  // Super kernel records are enabled by the L1 fusion option or by a non-zero SKT_ENABLE value.
  bool flag = GetL1FusionEnableOption();
  char skt_enable_env[MMPA_MAX_PATH] = { 0x00 };
  INT32 res = mmGetEnv("SKT_ENABLE", skt_enable_env, MMPA_MAX_PATH);
  int64_t env_flag = (res == EN_OK) ? std::strtol(skt_enable_env, nullptr, kDecimal) : 0;
  if (env_flag != 0) {
    flag = true;
  }

  // Value-initialize so that no member is left indeterminate.
  TaskDescInfo task_desc_info{};
  if (!om_name_.empty()) {
    task_desc_info.model_name = om_name_;
  } else {
    task_desc_info.model_name = name_;
  }
  task_desc_info.op_name = op->GetName();
  task_desc_info.block_dim = task_def.kernel().block_dim();
  task_desc_info.task_id = task->GetTaskID();
  task_desc_info.stream_id = task->GetStreamId();
  task_desc_info.shape_type = "static";
  task_desc_info.cur_iter_num = 0;
  // task type: stays invalid unless the task is a TE / AI CPU kernel or a KERNEL_EX task
  task_desc_info.task_type = kTaskTypeInvalid;
  auto model_task_type = static_cast<rtModelTaskType_t>(task_def.type());
  if (model_task_type == RT_MODEL_TASK_KERNEL) {
    const domi::KernelDef &kernel_def = task_def.kernel();
    const auto &context = kernel_def.context();
    auto kernel_type = static_cast<ccKernelType>(context.kernel_type());
    if (kernel_type == ccKernelType::TE) {
      task_desc_info.task_type = kTaskTypeAicore;
    } else if (kernel_type == ccKernelType::AI_CPU || kernel_type == ccKernelType::CUST_AI_CPU) {
      task_desc_info.task_type = kTaskTypeAicpu;
    } else {
      GELOGD("Other kernel type: %u", context.kernel_type());
    }
  } else if (model_task_type == RT_MODEL_TASK_KERNEL_EX) {
    task_desc_info.task_type = kTaskTypeAicpu;
  } else {
    GELOGD("Skip task type: %d", static_cast<int>(model_task_type));
  }
  profiler_report_op_info_[task_desc_info.op_name] =
    std::pair<uint32_t, uint32_t>(task_desc_info.task_id, task_desc_info.stream_id);
  task_desc_info_.emplace_back(task_desc_info);

  if (flag && (task->GetSktTaskID() != 0xFFFFFFFF)) {
    // Previously this record was default-initialized, so stream_id was read below while still
    // indeterminate and task_type/block_dim/cur_iter_num were stored with garbage values.
    // Value-initialization makes them zero; the task type is marked invalid because nothing
    // here determines it.
    // NOTE(review): the super kernel stream id is not supplied here and is therefore reported
    // as 0 - confirm whether task->GetStreamId() should be used instead.
    TaskDescInfo skt_task_desc_info{};
    skt_task_desc_info.op_name = "super_kernel_" + to_string(task_index);
    skt_task_desc_info.task_id = task->GetSktTaskID();
    skt_task_desc_info.task_type = kTaskTypeInvalid;
    profiler_report_op_info_[skt_task_desc_info.op_name] =
      std::pair<uint32_t, uint32_t>(skt_task_desc_info.task_id, skt_task_desc_info.stream_id);
    task_desc_info_.emplace_back(skt_task_desc_info);
  }
 }

 Status DavinciModel::DistributeTask() {
  GELOGI("do Distribute.");
  for (auto &task : cpu_task_list_) {
@@ -3075,19 +3134,11 @@ Status DavinciModel::DistributeTask() {
    GE_CHK_STATUS_RET(task->Distribute());
  }

  task_desc_info_.clear();
  bool flag = GetL1FusionEnableOption();
  char skt_enable_env[MMPA_MAX_PATH] = { 0x00 };
  INT32 res = mmGetEnv("SKT_ENABLE", skt_enable_env, MMPA_MAX_PATH);
  int64_t env_flag = (res == EN_OK) ? std::strtol(skt_enable_env, nullptr, kDecimal) : 0;
  if (env_flag != 0) {
    flag = true;
  }

  const auto &model_task_def = ge_model_->GetModelTaskDefPtr();
  for (size_t task_index = 0; task_index < task_list_.size(); ++task_index) {
    auto &task_def = model_task_def->task(task_index);
    auto &task = task_list_.at(task_index);
    GE_CHECK_NOTNULL(task);
    GE_CHK_STATUS_RET(task->Distribute(), "Task[%zu] distribute fail", task_index);
    // for data dump
    auto op_index = std::max(task_def.kernel().context().op_index(),
@@ -3107,33 +3158,9 @@ Status DavinciModel::DistributeTask() {
    GE_IF_BOOL_EXEC(no_need_profiling, continue);

    SaveDumpOpInfo(runtime_param_, op, task->GetTaskID(), task->GetStreamId());
    // Load task info for profiling
    TaskDescInfo task_desc_info;
    if (!om_name_.empty()) {
      task_desc_info.model_name = om_name_;
    } else {
      task_desc_info.model_name = name_;
    }
    task_desc_info.op_name = op->GetName();
    task_desc_info.block_dim = task_def.kernel().block_dim();
    task_desc_info.task_id = task->GetTaskID();
    task_desc_info.stream_id = task->GetStreamId();
    task_desc_info.shape_type = "static";
    task_desc_info.cur_iter_num = 0;
    profiler_report_op_info_[task_desc_info.op_name] =
      std::pair<uint32_t, uint32_t>(task_desc_info.task_id, task_desc_info.stream_id);
    task_desc_info_.emplace_back(task_desc_info);
    if (flag) {
      if (task->GetSktTaskID() != 0xFFFFFFFF) {
        TaskDescInfo task_desc_info;
        string op_name = "super_kernel_" + to_string(task_index);
        task_desc_info.op_name = op_name;
        task_desc_info.task_id = task->GetSktTaskID();
        profiler_report_op_info_[task_desc_info.op_name] =
          std::pair<uint32_t, uint32_t>(task_desc_info.task_id, task_desc_info.stream_id);
        task_desc_info_.emplace_back(task_desc_info);
      }
    }

    // save task info for profiling
    SaveProfilingTaskDescInfo(op, task, task_def, task_index);
  }
  // launch dump kernel to aicpu
  GE_CHK_STATUS_RET(data_dumper_.LoadDumpInfo(), "Load dump info failed.");
--- a/ge/graph/load/new_model_manager/davinci_model.h
+++ b/ge/graph/load/new_model_manager/davinci_model.h
@@ -623,6 +623,9 @@ class DavinciModel {

  Status DistributeTask();

  // Saves the profiling task description (model/op name, block dim, task id, stream id, task type)
  // of the task at `task_index` after it has been distributed.
  void SaveProfilingTaskDescInfo(const OpDescPtr &op, const TaskInfoPtr &task,
                                 const domi::TaskDef &task_def, size_t task_index);

  uint8_t *MallocFeatureMapMem(size_t data_size);

  uint8_t *MallocWeightsMem(size_t weights_size);
--- a/ge/hybrid/executor/worker/execution_engine.cc
+++ b/ge/hybrid/executor/worker/execution_engine.cc
@@ -159,27 +159,9 @@ Status NodeDoneCallback::GetTaskDescInfo(const NodePtr node, const HybridModel *
  }

  GELOGD("GetTaskDescInfo of node [%s] start.", node->GetName().c_str());
  auto op_desc = node->GetOpDesc();
  std::string op_name = op_desc->GetName();
  std::string dynamic_model_name = model->GetModelName();
  uint32_t task_id = context_->GetTaskId();
  uint32_t stream_id = context_->GetStreamId();
  TaskDescInfo tmp_task_desc_info;
  tmp_task_desc_info.model_name = dynamic_model_name;
  tmp_task_desc_info.op_name = op_name;
  tmp_task_desc_info.block_dim = 0;
  auto task_defs = model->GetTaskDefs(node);
  if (task_defs != nullptr && (*task_defs).size() > 0) {
    const auto &task_def = (*task_defs)[0];
    tmp_task_desc_info.block_dim = task_def.kernel().block_dim();
  }
  tmp_task_desc_info.task_id = task_id;
  tmp_task_desc_info.stream_id = stream_id;
  tmp_task_desc_info.shape_type = "dynamic";
  tmp_task_desc_info.cur_iter_num = graph_context_->iteration;
  GELOGD("GetTaskDescInfo of node [%s] end, task_id[%u], stream_id[%u]",
         node->GetName().c_str(), task_id, stream_id);
  task_desc_info.emplace_back(tmp_task_desc_info);
  task_desc_info = context_->GetProfilingTaskDescInfo();
  context_->ClearProfilingTaskDescInfo();

  return SUCCESS;
 }

@@ -247,7 +229,6 @@ Status NodeDoneCallback::ProfilingReport() {

  GELOGD("ProfilingReport of node [%s] model [%s] start.", node->GetName().c_str(), model->GetModelName().c_str());
  std::vector<TaskDescInfo> task_desc_info;
  TaskDescInfo tmp_task_desc_info;
  auto profiling_ret = GetTaskDescInfo(node, model, task_desc_info);
  if (profiling_ret != RT_ERROR_NONE) {
    GELOGE(profiling_ret, "Get task info of node[%s] failed.", node->GetName().c_str());
--- a/ge/hybrid/node_executor/aicore/aicore_node_executor.cc
+++ b/ge/hybrid/node_executor/aicore/aicore_node_executor.cc
@@ -182,16 +182,8 @@ Status AiCoreNodeTask::ExecuteAsync(TaskContext &context, std::function<void()>
    }
    RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[AiCoreNodeLaunchKernel] Start");
    GE_CHK_STATUS_RET_NOLOG((*it)->LaunchKernel(context.GetStream()));
    uint32_t task_id = 0;
    uint32_t stream_id = 0;
    rtError_t rt_ret = rtGetTaskIdAndStreamID(&task_id, &stream_id);
    if (rt_ret != RT_ERROR_NONE) {
      GELOGE(rt_ret, "Get task_id and stream_id failed.");
      return rt_ret;
    }
    context.SetTaskId(task_id);
    context.SetStreamId(stream_id);
    GELOGD("AiCore node[%s] task_id: %u, stream_id: %u.", context.GetNodeName(), task_id, stream_id);
    // save profiling data
    (void)context.SaveProfilingTaskDescInfo(kTaskTypeAicore, (*it)->GetBlockDim());
    RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[AiCoreNodeLaunchKernel] End");
    RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[AiCoreNodeLaunchKernel] End");
  }
--- a/ge/hybrid/node_executor/aicore/aicore_op_task.h
+++ b/ge/hybrid/node_executor/aicore/aicore_op_task.h
@@ -48,6 +48,8 @@ class AiCoreOpTask {

  bool GetClearAtomic() const {return clear_atomic_;}

  // Returns the value currently held in block_dim_.
  uint32_t GetBlockDim() const {
    return block_dim_;
  }

 protected:
  Status UpdateTilingInfo(TaskContext &context);
  virtual std::string GetKeyForOpParamSize() const;
--- a/ge/hybrid/node_executor/aicpu/aicpu_node_executor.cc
+++ b/ge/hybrid/node_executor/aicpu/aicpu_node_executor.cc
@@ -190,16 +190,8 @@ Status AicpuNodeTaskBase::ExecuteAsync(TaskContext &context, std::function<void(

  HYBRID_CHK_STATUS_RET(LaunchTask(context), "[%s] Failed to launch task", node_name_.c_str());

  uint32_t task_id = 0;
  uint32_t stream_id = 0;
  rtError_t rt_ret = rtGetTaskIdAndStreamID(&task_id, &stream_id);
  if (rt_ret != RT_ERROR_NONE) {
    GELOGE(rt_ret, "Get task_id and stream_id failed.");
    return rt_ret;
  }
  context.SetTaskId(task_id);
  context.SetStreamId(stream_id);
  GELOGD("AiCpu node[%s] task_id: %u, stream_id: %u.", context.GetNodeName(), task_id, stream_id);
  // save profiling data
  (void)context.SaveProfilingTaskDescInfo(kTaskTypeAicpu, 0);

  auto callback = [=, &context]() {
    GELOGD("Node[%s] callback start.", node_name_.c_str());
--- a/ge/hybrid/node_executor/task_context.cc
+++ b/ge/hybrid/node_executor/task_context.cc
@@ -21,6 +21,7 @@
 #include "graph/debug/ge_attr_define.h"
 #include "hybrid/executor/hybrid_execution_context.h"
 #include "hybrid/executor/subgraph_executor.h"
 #include "common/profiling/profiling_manager.h"

 namespace ge {
 namespace hybrid {
@@ -498,5 +499,42 @@ bool TaskContext::NeedCallback() {
 Status TaskContext::Synchronize() {
  return execution_context_->Synchronize(GetStream());
 }

 // Appends a "dynamic" shape profiling record for the task launched for this node.
 //
 // Does nothing and returns SUCCESS when model-execute profiling is off. Otherwise it queries
 // the runtime for the task id and stream id, combines them with the model name, op name,
 // iteration_ and the given task_type / block_dim, and stores the record in task_desc_info.
 //
 // @param task_type  task type value written to the record
 // @param block_dim  block dim value written to the record
 // @return SUCCESS on success; the runtime error code if the task/stream id query fails;
 //         whatever GE_CHECK_NOTNULL returns when a required pointer is null
 Status TaskContext::SaveProfilingTaskDescInfo(uint32_t task_type, uint32_t block_dim) {
  if (!ProfilingManager::Instance().ProfilingModelExecuteOn()) {
    return SUCCESS;
  }

  auto op_desc = GetNodeItem().GetOpDesc();
  GE_CHECK_NOTNULL(op_desc);

  // must be called after Launch kernel
  uint32_t launched_task_id = 0;
  uint32_t launched_stream_id = 0;
  const rtError_t query_ret = rtGetTaskIdAndStreamID(&launched_task_id, &launched_stream_id);
  if (query_ret != RT_ERROR_NONE) {
    GELOGE(query_ret, "Get task_id and stream_id failed.");
    return query_ret;
  }
  GELOGD("Node[%s] task_id: %u, stream_id: %u.", GetNodeName(), launched_task_id, launched_stream_id);

  const GraphExecutionContext *graph_context = GetExecutionContext();
  GE_CHECK_NOTNULL(graph_context);
  const HybridModel *model = graph_context->model;
  GE_CHECK_NOTNULL(model);

  TaskDescInfo record;
  record.op_name = op_desc->GetName();
  record.model_name = model->GetModelName();
  record.block_dim = block_dim;
  record.task_type = task_type;
  record.task_id = launched_task_id;
  record.stream_id = launched_stream_id;
  record.shape_type = "dynamic";
  record.cur_iter_num = iteration_;
  task_desc_info.push_back(record);

  return SUCCESS;
 }
 }  // namespace hybrid
 }  // namespace ge
--- a/ge/hybrid/node_executor/task_context.h
+++ b/ge/hybrid/node_executor/task_context.h
@@ -22,6 +22,7 @@
 #include <vector>
 #include "common/properties_manager.h"
 #include "external/ge/ge_api_error_codes.h"
 #include "framework/common/ge_types.h"
 #include "hybrid/common/tensor_value.h"
 #include "hybrid/common/npu_memory_allocator.h"
 #include "hybrid/executor/rt_callback_manager.h"
@@ -108,6 +109,10 @@ class TaskContext {
  void SetForceInferShape(bool force_infer_shape);
  void *handle_ = nullptr;

  // Read-only view of the profiling records collected so far in task_desc_info.
  const std::vector<TaskDescInfo> &GetProfilingTaskDescInfo() const {
    return task_desc_info;
  }
  // Collects one profiling record carrying the given task type and block dim.
  Status SaveProfilingTaskDescInfo(uint32_t task_type, uint32_t block_dim);
  // Removes every collected profiling record.
  void ClearProfilingTaskDescInfo() {
    task_desc_info.clear();
  }

 private:
  TaskContext(GraphExecutionContext *execution_context,
              const NodeItem *node_item,
@@ -127,6 +132,7 @@ class TaskContext {
  uint64_t iteration_ = 0;
  uint32_t task_id_ = 0;
  uint32_t stream_id_ = 0;
  std::vector<TaskDescInfo> task_desc_info;
 };
 }  // namespace hybrid
 }  // namespace ge
--- a/ge/single_op/single_op.cc
+++ b/ge/single_op/single_op.cc
@@ -70,6 +70,7 @@ Status ProfilingTaskInfo(OpTask *op_task, const string &shape_type) {
  tmp_task_desc_info.stream_id = stream_id;
  tmp_task_desc_info.shape_type = shape_type;
  tmp_task_desc_info.cur_iter_num = 0;
  tmp_task_desc_info.task_type = op_task->GetTaskType();
  GELOGD("GetTaskDescInfo of op [%s] end, task_id[%u], stream_id[%u]", op_name.c_str(), task_id, stream_id);
  task_desc_info.emplace_back(tmp_task_desc_info);

--- a/ge/single_op/task/op_task.cc
+++ b/ge/single_op/task/op_task.cc
@@ -145,6 +145,8 @@ Status OpTask::LaunchKernel(const vector<GeTensorDesc> &input_desc,
  return UNSUPPORTED;
 }

 // Base implementation: reports kTaskTypeInvalid; derived tasks override it.
 uint32_t OpTask::GetTaskType() const {
  return kTaskTypeInvalid;
 }

 TbeOpTask::~TbeOpTask() {
  if (sm_desc_ != nullptr) {
    (void)rtMemFreeManaged(sm_desc_);
@@ -161,6 +163,8 @@ size_t TbeOpTask::GetArgSize() const { return arg_size_; }

 const std::string &TbeOpTask::GetStubName() const { return stub_name_; }

 // TBE op tasks are reported to profiling as kTaskTypeAicore.
 uint32_t TbeOpTask::GetTaskType() const {
  return kTaskTypeAicore;
 }

 Status TbeOpTask::LaunchKernel(rtStream_t stream) {
  GELOGD("To invoke rtKernelLaunch. task = %s, block_dim = %u", this->stub_name_.c_str(), block_dim_);
  auto *sm_desc = reinterpret_cast<rtSmDesc_t *>(sm_desc_);
@@ -802,6 +806,8 @@ Status AiCpuBaseTask::UpdateArgTable(const SingleOpModelParam &param) {
  return DoUpdateArgTable(param, false);
 }

 // AI CPU based tasks are reported to profiling as kTaskTypeAicpu.
 uint32_t AiCpuBaseTask::GetTaskType() const {
  return kTaskTypeAicpu;
 }

 void AiCpuTask::GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) {
  arg_base = reinterpret_cast<uintptr_t *>(io_addr_host_.data());
  arg_count = io_addr_host_.size();
--- a/ge/single_op/task/op_task.h
+++ b/ge/single_op/task/op_task.h
@@ -52,6 +52,7 @@ class OpTask {
                              std::vector<GeTensorDesc> &output_desc,
                              std::vector<DataBuffer> &output_buffers,
                              rtStream_t stream);
  virtual uint32_t GetTaskType() const;

 protected:
  Status DoUpdateArgTable(const SingleOpModelParam &param, bool keep_workspace);
@@ -85,6 +86,7 @@ class TbeOpTask : public OpTask {
  size_t GetArgSize() const;
  const std::string &GetStubName() const;
  void EnableDynamicSupport(const NodePtr &node, void *tiling_buffer, size_t max_tiling_size);
  uint32_t GetTaskType() const override;

 private:
  friend class SingleOpModel;
@@ -113,6 +115,8 @@ class AiCpuBaseTask : public OpTask {
  ~AiCpuBaseTask() override;
  UnknowShapeOpType GetUnknownType() const { return unknown_type_; }
  Status UpdateArgTable(const SingleOpModelParam &param) override;
  uint32_t GetTaskType() const override;

 protected:
  Status UpdateIoAddr(const std::vector<DataBuffer> &inputs, const std::vector<DataBuffer> &outputs);
  Status SetInputConst();
--- a/inc/framework/common/ge_types.h
+++ b/inc/framework/common/ge_types.h
@@ -41,12 +41,7 @@ enum FrameworkType {
 };

 const std::map<std::string, std::string> kFwkTypeToStr = {
    {"0", "Caffe"},
    {"1", "MindSpore"},
    {"3", "TensorFlow"},
    {"4", "Android_NN"},
    {"5", "Onnx"}
 };
  {"0", "Caffe"}, {"1", "MindSpore"}, {"3", "TensorFlow"}, {"4", "Android_NN"}, {"5", "Onnx"}};

 enum OpEngineType {
  ENGINE_SYS = 0,  // default engine
@@ -61,6 +56,11 @@ enum InputAippType { DATA_WITHOUT_AIPP = 0, DATA_WITH_STATIC_AIPP, DATA_WITH_DYN
 const char *const GE_ENGINE_ATTR_MEM_TYPE_HBM = "HBM";
 const char *const GE_OPTION_EXEC_PLACEMENT = "ge.exec.placement";

 // Task type values written to TaskDescInfo::task_type for profiling.
 constexpr uint32_t kTaskTypeAicore = 0;        // task runs as an AI Core kernel
 constexpr uint32_t kTaskTypeAicpu = 1;         // task runs as an AI CPU kernel
 constexpr uint32_t kTaskTypeInvalid = 0xFFFF;  // task type not determined

 // Data cache, including data address and length
 struct DataBuffer {
 public:
@@ -256,6 +256,7 @@ struct TaskDescInfo {
  uint32_t stream_id;
  std::string shape_type;
  int64_t cur_iter_num;
  uint32_t task_type;
 };

 // Profiling info of graph
--- a/+ 1
+++ b/+ 1
@@ -1 +1 @@
 Subproject commit dc6cceb67bc82b567bcbd6f415776644253e1467
 Subproject commit b00c50c2a8c2ce06929b27f7b74185a950737ec8
--- a/+ 1
+++ b/+ 1
@@ -1 +1 @@
 Subproject commit 4e72aae41e78af1a19cd965da4a45cbd988b9a75
 Subproject commit f0109a2c70981d74932bb38bb56722caff3323a5