
!1616 Optimize performance of single_op executor.

From: @zhao_zhixuan
Reviewed-by: 
Signed-off-by:
tags/v1.3.0
mindspore-ci-bot (Gitee) committed 3 years ago
commit 14363b0f91
13 changed files with 209 additions and 39 deletions
  1. ge/graph/passes/reshape_recovery_pass.cc  +7 -1
  2. ge/hybrid/model/hybrid_model_builder.cc  +41 -25
  3. ge/hybrid/model/hybrid_model_builder.h  +2 -0
  4. ge/hybrid/node_executor/aicore/aicore_op_task.cc  +2 -3
  5. ge/single_op/single_op.cc  +32 -0
  6. ge/single_op/single_op.h  +3 -0
  7. ge/single_op/single_op_model.cc  +32 -0
  8. ge/single_op/single_op_model.h  +2 -0
  9. metadef  +1 -1
  10. tests/ut/ge/graph/passes/reshape_recovery_pass_unittest.cc  +4 -4
  11. tests/ut/ge/hybrid/ge_hybrid_unittest.cc  +46 -4
  12. tests/ut/ge/single_op/single_op_model_unittest.cc  +17 -0
  13. tests/ut/ge/single_op/single_op_unittest.cc  +20 -1

ge/graph/passes/reshape_recovery_pass.cc  +7 -1

@@ -60,7 +60,7 @@ Status InsertReshapeIfNeed(const NodePtr &node) {
node->GetName().c_str(), src_anchor->GetIdx(), dst_node->GetName().c_str(), dst_anchor->GetIdx());
GE_CHECK_NOTNULL(dst_node);
GE_CHECK_NOTNULL(dst_node->GetOpDesc());
- auto dst_tensor = dst_node->GetOpDesc()->GetInputDescPtr(dst_anchor->GetIdx());
+ auto dst_tensor = dst_node->GetOpDesc()->MutableInputDesc(dst_anchor->GetIdx());
GE_CHECK_NOTNULL(dst_tensor);
bool is_dynamic = false;
const auto &src_tensor_dims = src_tensor->GetShape().GetDims();
@@ -71,6 +71,12 @@ Status InsertReshapeIfNeed(const NodePtr &node) {
dst_node->GetName().c_str());
is_dynamic = true;
}
if (dst_node->GetType() == NETOUTPUT && is_dynamic) {
// NetOutput shape must be continuous when dynamic shape.
// Otherwise, there may be an error waiting for the shape refresh to time out during execution.
dst_tensor->SetShape(src_tensor->GetShape());
continue;
}
bool is_need_insert_reshape = src_tensor_dims != dst_tensor_dims &&
!is_dynamic;
if (is_need_insert_reshape) {


ge/hybrid/model/hybrid_model_builder.cc  +41 -25

@@ -291,6 +291,46 @@ Status HybridModelBuilder::ParseForceInfershapeNodes(const NodePtr &node, NodeIt
return SUCCESS;
}

Status HybridModelBuilder::ParseDependencies(NodeItem &node_item, const std::vector<string> &dependencies,
std::set<NodePtr> &dependent_for_shape_inference) {
for (const auto &input_name : dependencies) {
int input_index = node_item.op_desc->GetInputIndexByName(input_name);
if (input_index < 0) {
GELOGE(INTERNAL_ERROR, "[Get][InputIndex]failed, node:[%s] inputname: %s.",
node_item.NodeName().c_str(), input_name.c_str());
REPORT_CALL_ERROR("E19999", "GetInputIndexByName failed, node:[%s] inputname: %s.",
node_item.NodeName().c_str(), input_name.c_str());
return INTERNAL_ERROR;
}

const auto &in_anchor = node_item.node->GetInDataAnchor(input_index);
GE_CHECK_NOTNULL(in_anchor);
const auto &peer_out_anchor = in_anchor->GetPeerOutAnchor();
GE_CHECK_NOTNULL(peer_out_anchor);
const auto &src_node = peer_out_anchor->GetOwnerNode();
GE_CHECK_NOTNULL(src_node);
auto src_node_item = MutableNodeItem(src_node);
GE_CHECK_NOTNULL(src_node_item);
if (src_node_item->NodeType() == DATA) {
auto op_desc = src_node_item->GetOpDesc();
GE_CHECK_NOTNULL(op_desc);
auto tensor = op_desc->MutableInputDesc(0);
if (AttrUtils::HasAttr(tensor, ATTR_NAME_VALUE)) {
GELOGD("Skip d2h memcpy, get hostmem from node %s.", src_node_item->NodeName().c_str());
continue;
}
}
src_node_item->to_const_output_id_list.emplace(peer_out_anchor->GetIdx());
dependent_for_shape_inference.emplace(src_node);
host_input_value_dependencies_[&node_item].emplace_back(peer_out_anchor->GetIdx(), src_node_item);
GELOGD("[%s] Dependent added from output of [%s:%d]",
node_item.NodeName().c_str(),
src_node_item->NodeName().c_str(),
peer_out_anchor->GetIdx());
}
return SUCCESS;
}

Status HybridModelBuilder::ParseDependentInputNodes(NodeItem &node_item, const std::vector<string> &dependencies) {
std::set<NodePtr> dependent_for_shape_inference;
std::set<NodePtr> dependent_for_execution;
@@ -357,31 +397,7 @@ Status HybridModelBuilder::ParseDependentInputNodes(NodeItem &node_item, const s
src_node_item->NodeName().c_str());
}

- for (const auto &input_name : dependencies) {
- int input_index = node_item.op_desc->GetInputIndexByName(input_name);
- if (input_index < 0) {
- GELOGE(INTERNAL_ERROR, "[Get][InputIndex]failed, node:[%s] inputname: %s.",
- node_item.NodeName().c_str(), input_name.c_str());
- REPORT_CALL_ERROR("E19999", "GetInputIndexByName failed, node:[%s] inputname: %s.",
- node_item.NodeName().c_str(), input_name.c_str());
- return INTERNAL_ERROR;
- }
-
- const auto &in_anchor = ge_node->GetInDataAnchor(input_index);
- GE_CHECK_NOTNULL(in_anchor);
- const auto &peer_out_anchor = in_anchor->GetPeerOutAnchor();
- GE_CHECK_NOTNULL(peer_out_anchor);
- const auto &src_node = peer_out_anchor->GetOwnerNode();
- GE_CHECK_NOTNULL(src_node);
- auto src_node_item = MutableNodeItem(src_node);
- src_node_item->to_const_output_id_list.emplace(peer_out_anchor->GetIdx());
- dependent_for_shape_inference.emplace(src_node);
- host_input_value_dependencies_[&node_item].emplace_back(peer_out_anchor->GetIdx(), src_node_item);
- GELOGD("[%s] Dependent added from output of [%s:%d]",
- node_item.NodeName().c_str(),
- src_node_item->NodeName().c_str(),
- peer_out_anchor->GetIdx());
- }
+ GE_CHK_STATUS_RET(ParseDependencies(node_item, dependencies, dependent_for_shape_inference));

GE_CHK_STATUS_RET(ParseDependentForFusedSubgraph(node_item, dependent_for_shape_inference));
for (const auto &dep_node : dependent_for_shape_inference) {


ge/hybrid/model/hybrid_model_builder.h  +2 -0

@@ -65,6 +65,8 @@ class HybridModelBuilder {
Status ParseForceInfershapeNodes(const NodePtr &node, NodeItem &node_item);
Status CollectParallelGroups(NodeItem *node_item);
Status ParseDependentInputNodes(NodeItem &node_item, const std::vector<string> &dependencies);
Status ParseDependencies(NodeItem &node_item, const std::vector<string> &dependencies,
std::set<NodePtr> &dependent_for_shape_inference);
Status ParseDependentForFusedSubgraph(NodeItem &node_item, std::set<ge::NodePtr> &dependencies);
Status ParseDependentByParallelGroup();
Status IndexTaskDefs();


ge/hybrid/node_executor/aicore/aicore_op_task.cc  +2 -3

@@ -401,9 +401,8 @@ Status AiCoreOpTask::UpdateTilingInfo(TaskContext &context) {
}

RECORD_EXECUTION_EVENT(execution_context, context.GetNodeName(), "[CopyTilingInfo] Start");
- GE_CHK_RT_RET(rtMemcpy(tiling_buffer_->GetData(), tiling_buffer_->GetSize(),
- tiling_data_.c_str(), tiling_data_.size(),
- RT_MEMCPY_HOST_TO_DEVICE));
+ GE_CHK_RT_RET(rtMemcpyAsync(tiling_buffer_->GetData(), tiling_buffer_->GetSize(), tiling_data_.c_str(),
+ tiling_data_.size(), RT_MEMCPY_HOST_TO_DEVICE_EX, context.GetStream()));
RECORD_EXECUTION_EVENT(execution_context, context.GetNodeName(), "[CopyTilingInfo] End");

GELOGD("[%s] Done updating tiling info for task: [%s]", node->GetName().c_str(), stub_name_.c_str());


ge/single_op/single_op.cc  +32 -0

@@ -361,6 +361,37 @@ Status DynamicSingleOp::SetHostTensorValue(const std::vector<std::pair<size_t, u
return SUCCESS;
}

Status DynamicSingleOp::SetHostTensorValue(const vector<GeTensorDesc> &input_desc,
const vector<DataBuffer> &input_buffers) {
for (auto &tensor_map : tensor_with_hostmem_) {
auto index = static_cast<size_t>(tensor_map.first);
if (index >= input_desc.size() || index >= input_buffers.size()) {
GELOGE(INTERNAL_ERROR, "[Check][Size]Index %zu should smaller then input desc size %zu "
"and input buffers size %zu.", index, input_desc.size(), input_buffers.size());
return INTERNAL_ERROR;
}
auto ge_tensor_desc = input_desc[index];
// reconstruct GeTensor by DataBuffer
GeTensorPtr ge_tensor = MakeShared<GeTensor>(ge_tensor_desc);
GE_CHECK_NOTNULL(ge_tensor);
GELOGD("The %zu tensor input type is host, desc data type is %d, input buffer addr is %p, size is %ld.",
index, ge_tensor_desc.GetDataType(), input_buffers[index].data, input_buffers[index].length);
if (ge_tensor->SetData(reinterpret_cast<uint8_t *>(input_buffers[index].data),
static_cast<size_t>(input_buffers[index].length)) != SUCCESS) {
GELOGE(INTERNAL_ERROR, "[Set][Data]Failed to set data of ge tensor.");
return INTERNAL_ERROR;
}
for (auto &tensor_desc : tensor_map.second) {
GE_CHECK_NOTNULL(tensor_desc);
if (!AttrUtils::SetTensor(tensor_desc, ATTR_NAME_VALUE, ge_tensor)) {
GELOGE(FAILED, "[Set][ATTR_NAME_VALUE]Failed to set ATTR_NAME_VALUE.");
return FAILED;
}
}
}
return SUCCESS;
}

Status DynamicSingleOp::ExecuteAsync(const vector<GeTensorDesc> &input_desc,
const vector<DataBuffer> &input_buffers,
vector<GeTensorDesc> &output_desc,
@@ -374,6 +405,7 @@ Status DynamicSingleOp::ExecuteAsync(const vector<GeTensorDesc> &input_desc,
if (!inputs_size.empty()) {
StreamResource *stream_resource = SingleOpManager::GetInstance().GetResource(resource_id_, stream_);
GE_CHK_STATUS_RET_NOLOG(UpdateInputsBufferAddr(stream_resource, stream_, inputs_size, update_buffers));
GE_CHK_STATUS_RET_NOLOG(SetHostTensorValue(input_desc, input_buffers));
}

if (hybrid_model_executor_ != nullptr) {
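Note: the new SetHostTensorValue overload is what removes the per-execution device-to-host copy. When an input already lives in host memory, its value is wrapped in a GeTensor and attached as ATTR_NAME_VALUE to the tensor descs that depend on it, so value-dependent shape inference reads the attribute instead of copying the data back from the device (see the "Skip d2h memcpy" branch in hybrid_model_builder.cc above). A hedged sketch of the wrapping step, using only graph APIs that appear in this hunk (the helper name and parameters are illustrative):

#include <cstdint>
#include <memory>
#include "graph/ge_tensor.h"
#include "graph/utils/attr_utils.h"
#include "graph/debug/ge_attr_define.h"  // ATTR_NAME_VALUE ("_value")

// Sketch: attach a host-resident input value to a dependent tensor desc.
// `host_data`/`len` stand in for DataBuffer::data / DataBuffer::length.
bool AttachHostValue(const ge::GeTensorDesc &input_desc, uint8_t *host_data, size_t len,
                     const ge::GeTensorDescPtr &dependent_desc) {
  if (host_data == nullptr || dependent_desc == nullptr) {
    return false;
  }
  auto value = std::make_shared<ge::GeTensor>(input_desc);
  if (value == nullptr || value->SetData(host_data, len) != ge::GRAPH_SUCCESS) {
    return false;  // no device copy is involved; data stays in host memory
  }
  // Downstream value-dependent shape inference looks this attribute up instead
  // of issuing a device-to-host memcpy for the tensor contents.
  return ge::AttrUtils::SetTensor(dependent_desc, ge::ATTR_NAME_VALUE, value);
}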


ge/single_op/single_op.h  +3 -0

@@ -81,9 +81,12 @@ class DynamicSingleOp {
std::vector<DataBuffer> &outputs) const;
Status SetHostTensorValue(const std::vector<std::pair<size_t, uint64_t>> &inputs_size,
const vector<GeTensorDesc> &input_desc, const std::vector<DataBuffer> &input_buffers);
Status SetHostTensorValue(const vector<GeTensorDesc> &input_desc, const vector<DataBuffer> &input_buffers);
std::unique_ptr<OpTask> op_task_;
std::unique_ptr<hybrid::HybridModel> hybrid_model_;
std::unique_ptr<hybrid::HybridModelExecutor> hybrid_model_executor_;
std::map<int32_t, std::vector<GeTensorDescPtr>> tensor_with_hostmem_;

uintptr_t resource_id_ = 0;
std::mutex *stream_mutex_;
rtStream_t stream_ = nullptr;


ge/single_op/single_op_model.cc  +32 -0

@@ -235,6 +235,13 @@ Status SingleOpModel::LoadAllNodes() {

if (op_type == DATA_TYPE || op_type == AIPP_DATA_TYPE) {
data_ops_.emplace_back(op_desc);
auto tensor = op_desc->MutableInputDesc(0);
if (AttrUtils::HasAttr(tensor, ATTR_NAME_VALUE)) {
int32_t index = 0;
(void) AttrUtils::GetInt(op_desc, ATTR_NAME_INDEX, index);
GELOGD("Node %s, index %d, has host mem.", node->GetName().c_str(), index);
op_with_hostmem_[index] = node;
}
continue;
}

@@ -616,6 +623,7 @@ Status SingleOpModel::BuildDynamicOp(StreamResource &resource, DynamicSingleOp &
if (need_hybrid_model) {
GELOGD("Build single op HybridModel.");
GE_CHK_STATUS_RET_NOLOG(hybrid::NodeExecutorManager::GetInstance().EnsureInitialized());
GE_CHK_STATUS(SetHostMemTensor(single_op), "[Init][HostMem]Failed.");
auto root_model = model_helper_.GetGeRootModel();
GE_CHECK_NOTNULL(root_model);
root_model->SetRootGraph(GraphUtils::GetComputeGraph(ge_model->GetGraph()));
@@ -634,4 +642,28 @@ Status SingleOpModel::BuildDynamicOp(StreamResource &resource, DynamicSingleOp &
}
return BuildTaskListForDynamicOp(&resource, single_op);
}

Status SingleOpModel::SetHostMemTensor(DynamicSingleOp &single_op) {
for (auto &node_map : op_with_hostmem_) {
auto node = node_map.second;
auto out_anchor = node->GetOutDataAnchor(0);
GE_CHECK_NOTNULL(out_anchor);
auto in_anchors = out_anchor->GetPeerInDataAnchors();
vector<GeTensorDescPtr> tensor_descs;
auto idx = node_map.first;
for (auto anchor : in_anchors) {
GE_CHECK_NOTNULL(anchor);
auto output_node = anchor->GetOwnerNode();
GE_CHECK_NOTNULL(output_node);
auto op_desc = output_node->GetOpDesc();
GE_CHECK_NOTNULL(op_desc);
auto tensor_desc = op_desc->MutableInputDesc(anchor->GetIdx());
tensor_descs.emplace_back(tensor_desc);
GELOGD("Get %d th input tensor desc of %s by %d data node: %s.", anchor->GetIdx(),
output_node->GetName().c_str(), idx, node->GetName().c_str());
}
single_op.tensor_with_hostmem_[idx] = tensor_descs;
}
return SUCCESS;
}
} // namespace ge
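Note: taken together with the single_op.cc changes above, SetHostMemTensor completes the host-memory fast path, and the hybrid builder's ParseDependencies then skips the device-to-host dependency for such Data nodes. A rough summary of the data flow, using member names as declared in the headers in this change (the step labels are editorial):

// 1. Load time  (SingleOpModel::LoadAllNodes): a Data input whose tensor desc
//    already carries ATTR_NAME_VALUE is recorded:
//      op_with_hostmem_[data_index] = data_node;
// 2. Build time (SingleOpModel::SetHostMemTensor): the input descs of that Data
//    node's consumers are handed to the op:
//      single_op.tensor_with_hostmem_[data_index] = tensor_descs;
// 3. Execute time (DynamicSingleOp::SetHostTensorValue): the caller's host buffer
//    for input `data_index` is wrapped in a GeTensor and set as ATTR_NAME_VALUE on
//    each recorded desc, so no device-to-host copy is needed.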

ge/single_op/single_op_model.h  +2 -0

@@ -77,6 +77,7 @@ class SingleOpModel {
static void ParseOpModelParams(ModelHelper &model_helper, SingleOpModelParam &param);
void ParseArgTable(OpTask *task, SingleOp &op);
Status InitHybridModelExecutor(const StreamResource &resource, const GeModelPtr &ge_model, SingleOp &single_op);
Status SetHostMemTensor(DynamicSingleOp &single_op);

std::string model_name_;
uint32_t model_id_ = 0;
@@ -86,6 +87,7 @@ class SingleOpModel {
ModelHelper model_helper_;

map<uint32_t, NodePtr> op_list_;
map<int32_t, NodePtr> op_with_hostmem_;
SingleOpModelParam model_params_;

std::vector<ptrdiff_t> input_offset_list_;


metadef  +1 -1

@@ -1 +1 @@
- Subproject commit 68474443bd6966eade3e32d6dfa2cc62f5872d2c
+ Subproject commit 8dd3448e2f0150c51266bc120bdd5d171a003e6b

tests/ut/ge/graph/passes/reshape_recovery_pass_unittest.cc  +4 -4

@@ -42,8 +42,8 @@ ut::GraphBuilder Graph1Builder() {
auto var1 = builder.AddNode("var1", "Variable", 0, 1, FORMAT_ND, DT_FLOAT, {-1});
auto const1 = builder.AddNode("const1", "Const", 0, 1, FORMAT_ND, DT_FLOAT, {1, 1, 224, 224});
auto transdata2 = builder.AddNode("transdata2", "Transdata", 1, 1, FORMAT_ND, DT_FLOAT, {224, 224});
auto transdata1 = builder.AddNode("transdata1", "Transdata", 1, 1, FORMAT_ND, DT_FLOAT, {224, 224});
auto netoutput1 = builder.AddNode("netoutput1", "Netoutput", 2, 0);
auto transdata1 = builder.AddNode("transdata1", "Transdata", 1, 1, FORMAT_ND, DT_FLOAT, {-1, 224});
auto netoutput1 = builder.AddNode("netoutput1", "NetOutput", 2, 0);

builder.AddDataEdge(var1, 0, transdata1, 0);
builder.AddDataEdge(const1, 0, transdata2, 0);
@@ -58,10 +58,10 @@ TEST_F(UtestReshapeRecoveryPass, reshape_recovery_with_dynamic_shape) {
auto builder = Graph1Builder();
auto graph = builder.GetGraph();
ReshapeRecoveryPass reshape_recovery_pass;
- EXPECT_EQ(graph->GetDirectNodesSize(),5);
+ EXPECT_EQ(graph->GetDirectNodesSize(), 5);
Status ret = reshape_recovery_pass.Run(graph);
EXPECT_EQ(ret, SUCCESS);
- EXPECT_EQ(graph->GetDirectNodesSize(),8);
+ EXPECT_EQ(graph->GetDirectNodesSize(), 7);

auto reshape1 = graph->FindNode("Reshape_ReshapeRecoveryPass_0");
EXPECT_NE(reshape1, nullptr);


tests/ut/ge/hybrid/ge_hybrid_unittest.cc  +46 -4

@@ -19,9 +19,9 @@
#include <vector>
#include "runtime/rt.h"

#include "graph/utils/node_utils.h"
#define protected public
#define private public
#include "graph/utils/node_utils.h"
#include "hybrid/model/hybrid_model_builder.h"
#include "hybrid/model/hybrid_model.h"
#include "hybrid/node_executor/node_executor.h"
@@ -111,14 +111,26 @@ TEST_F(UtestGeHybrid, aicore_op_task_init_success) {

TEST_F(UtestGeHybrid, task_update_tiling_info) {
auto aicore_task = std::unique_ptr<hybrid::AiCoreOpTask>(new(std::nothrow)hybrid::AiCoreOpTask());
aicore_task->is_single_op_ = true;
auto graph = make_shared<ComputeGraph>("graph");
OpDescPtr op_desc = CreateOpDesc("Add", "Add");
ge::AttrUtils::SetStr(op_desc, "compile_info_key", "key");
ge::AttrUtils::SetStr(op_desc, "compile_info_json", "json");
ge::AttrUtils::SetBool(op_desc, "support_dynamicshape", true);
ge::AttrUtils::SetInt(op_desc, "op_para_size", 1);
auto node = graph->AddNode(op_desc);
optiling::OpRunInfo tiling_info;
ASSERT_EQ(aicore_task->CalcTilingInfo(node, tiling_info), SUCCESS);

std::unique_ptr<NodeItem> node_item;
NodeItem::Create(node, node_item);
node_item->input_start = 0;
node_item->output_start = 0;

GraphExecutionContext execution_context;
SubgraphContext subgraph_context(nullptr, &execution_context);
NodeState node_state(*node_item, &subgraph_context);
auto task_context = TaskContext::Create(&node_state, &execution_context, &subgraph_context);
ASSERT_TRUE(task_context != nullptr);
ASSERT_EQ(aicore_task->InitTilingInfo(*op_desc), SUCCESS);
ASSERT_EQ(aicore_task->UpdateTilingInfo(*task_context), SUCCESS);
}

TEST_F(UtestGeHybrid, index_taskdefs_failed) {
@@ -669,3 +681,33 @@ TEST_F(UtestGeHybrid, TestParseDependentInputNodesForHccl) {
ASSERT_EQ(model.node_items_[node_1]->dependents_for_execution.size(), 0);
ASSERT_EQ(model.node_items_[node_2]->dependents_for_execution.size(), 1);
}

TEST_F(UtestGeHybrid, TestParseDependencies) {
// make graph
ut::GraphBuilder graph_builder = ut::GraphBuilder("graph");
auto data = graph_builder.AddNode("Data", "Data", 0, 1);
auto netoutput = graph_builder.AddNode("Netoutput", "NetOutput", 1, 0);
graph_builder.AddDataEdge(data, 0, netoutput, 0);
auto graph = graph_builder.GetGraph();

GeRootModelPtr root_model = MakeShared<ge::GeRootModel>(graph);
HybridModel model(root_model);
HybridModelBuilder builder(model);

std::unique_ptr<NodeItem> node_item;
NodeItem::Create(netoutput, node_item);
std::unique_ptr<NodeItem> node_item2;
NodeItem::Create(data, node_item2);
model.node_items_.emplace(data, std::move(node_item2));

std::vector<std::string> deps;
deps.push_back("Data");
auto op_desc = netoutput->GetOpDesc();
op_desc->input_name_idx_["Data"] = 0;
auto data_desc = data->GetOpDesc();
auto tensor = std::make_shared<GeTensor>();
auto tensor_desc = data_desc->MutableInputDesc(0);
AttrUtils::SetTensor(tensor_desc, "_value", tensor);
std::set<NodePtr> dependent_for_shape_inference;
ASSERT_EQ(builder.ParseDependencies(*node_item, deps, dependent_for_shape_inference), SUCCESS);
}

tests/ut/ge/single_op/single_op_model_unittest.cc  +17 -0

@@ -27,6 +27,7 @@
#include "single_op/task/tbe_task_builder.h"
#undef private
#undef protected
#include "graph/passes/graph_builder_utils.h"

using namespace std;
using namespace testing;
@@ -223,3 +224,19 @@ TEST_F(UtestSingleOpModel, test_build_dynamic_op) {
model.BuildDynamicOp(res, dynamic_single_op);
}

TEST_F(UtestSingleOpModel, test_host_mem) {
string model_data_str = "123456789";
SingleOpModel model("model", model_data_str.c_str(), model_data_str.size());

// make graph
ut::GraphBuilder builder = ut::GraphBuilder("graph");
auto data = builder.AddNode("Data", "Data", 0, 1);
auto netoutput = builder.AddNode("Netoutput", "NetOutput", 1, 0);
builder.AddDataEdge(data, 0, netoutput, 0);
auto graph = builder.GetGraph();
model.op_with_hostmem_[0] = data;

std::mutex stream_mu_;
DynamicSingleOp single_op(0, &stream_mu_, nullptr);
ASSERT_EQ(model.SetHostMemTensor(single_op), SUCCESS);
}

tests/ut/ge/single_op/single_op_unittest.cc  +20 -1

@@ -160,4 +160,23 @@ TEST_F(UtestSingleOp, test_singleop_execute_async2) {
EXPECT_EQ(single_op.running_param_->mem_base, nullptr);
EXPECT_EQ(single_op.tasks_.size(), 0);
EXPECT_EQ(single_op.ExecuteAsync(input_buffers, output_buffers), PARAM_INVALID);
}
}

TEST_F(UtestSingleOp, test_set_host_mem) {
std::mutex stream_mu_;
DynamicSingleOp single_op(0, &stream_mu_, nullptr);
vector<DataBuffer> input_buffers;
DataBuffer data_buffer;
input_buffers.emplace_back(data_buffer);

vector<GeTensorDesc> input_descs;
GeTensorDesc tensor_desc1;
input_descs.emplace_back(tensor_desc1);

vector<GeTensorDescPtr> op_input_descs;
auto tensor_desc2 = std::make_shared<GeTensorDesc>();
op_input_descs.emplace_back(tensor_desc2);
single_op.tensor_with_hostmem_[0] = op_input_descs;
EXPECT_EQ(single_op.SetHostTensorValue(input_descs, input_buffers), SUCCESS);
}
