From 5e8f1579e24b3cac117b33177c0cef27767ae7ce Mon Sep 17 00:00:00 2001
From: weiyang <yangwei79@huawei.com>
Date: Sat, 31 Oct 2020 14:26:34 +0800
Subject: [PATCH] fix dynamic shape with while

---
 .../load/new_model_manager/davinci_model.cc   | 30 +++++++++++--------
 .../load/new_model_manager/davinci_model.h    |  2 +-
 .../label_switch_by_index_task_info.cc        |  2 +-
 .../task_info/memcpy_async_task_info.cc       | 22 ++++++++++++--
 .../task_info/memcpy_async_task_info.h        |  2 ++
 ge/graph/passes/subgraph_pass.cc              | 16 +++++++++-
 ge/single_op/single_op_model.cc               |  4 ---
 .../task/aicpu_kernel_task_builder.cc         |  1 +
 8 files changed, 56 insertions(+), 23 deletions(-)

diff --git a/ge/graph/load/new_model_manager/davinci_model.cc b/ge/graph/load/new_model_manager/davinci_model.cc
index cf6493cc..cb37182c 100755
--- a/ge/graph/load/new_model_manager/davinci_model.cc
+++ b/ge/graph/load/new_model_manager/davinci_model.cc
@@ -649,7 +649,6 @@ Status DavinciModel::Init(void *dev_ptr, size_t mem_size, void *weight_ptr, size
   for (const ge::NodePtr &node : compute_graph->GetDirectNode()) {
     auto op_desc = node->GetOpDesc();
     GE_IF_BOOL_EXEC(op_desc == nullptr, continue);
-    GetFixedAddrAttr(op_desc);
     GE_IF_BOOL_EXEC(op_desc->GetType() != VARIABLE, continue);
     GE_IF_BOOL_EXEC(IsBroadCastOpData(node),
                     (void)ge::AttrUtils::SetStr(op_desc, VAR_ATTR_VAR_IS_BROADCAST, "var_is_restore"););
@@ -838,7 +837,8 @@ Status DavinciModel::InitNodes(const ComputeGraphPtr &compute_graph) {
       }
       continue;
     }
-
+    // for dynamic shape with control flow
+    SetLabelForDynamic(node);
     if (IsNoTaskAndDumpNeeded(op_desc)) {
       GELOGD("node[%s] without task, and save op_desc and addr for dump", op_desc->GetName().c_str());
       const RuntimeParam &rts_param = GetRuntimeParam();
@@ -912,6 +912,41 @@ Status DavinciModel::InitInputOutputForDynamic(const ComputeGraphPtr &compute_gr
   return SUCCESS;
 }
 
+/// @ingroup ge
+/// @brief Tag the producers feeding a LabelSwitchByIndex node for dynamic shape with control flow.
+///        Each producer gets ATTR_DYNAMIC_SHAPE_FIXED_ADDR (the label switch node's name) and
+///        ATTR_DYNAMIC_SHAPE_FIXED_ADDR_INDEX (0); index 0 is also recorded for that name in
+///        tensor_name_to_peer_output_index_.
+/// @param [in] node: node to inspect; ignored unless known_node_ is set and its type is
+///        LABELSWITCHBYINDEX.
+/// @return void
+void DavinciModel::SetLabelForDynamic(const NodePtr &node) {
+  if (!known_node_ || node == nullptr || node->GetOpDesc() == nullptr ||
+      node->GetOpDesc()->GetType() != LABELSWITCHBYINDEX) {
+    return;
+  }
+  // The key is the label switch node's own name, so it is loop-invariant.
+  const string tensor_name = node->GetName();
+  for (const auto &in_data_anchor : node->GetAllInDataAnchors()) {
+    if (in_data_anchor == nullptr) {
+      continue;
+    }
+    const auto peer_out_data_anchor = in_data_anchor->GetPeerOutAnchor();
+    if (peer_out_data_anchor == nullptr) {
+      continue;  // unconnected input
+    }
+    const auto peer_node = peer_out_data_anchor->GetOwnerNode();
+    if (peer_node == nullptr) {
+      continue;
+    }
+    // NOTE(review): the output index is hard-coded to 0 rather than read from the peer anchor;
+    // confirm that the producer always exposes the value on output 0.
+    (void)AttrUtils::SetStr(peer_node->GetOpDesc(), ATTR_DYNAMIC_SHAPE_FIXED_ADDR, tensor_name);
+    (void)AttrUtils::SetInt(peer_node->GetOpDesc(), ATTR_DYNAMIC_SHAPE_FIXED_ADDR_INDEX, 0);
+    tensor_name_to_peer_output_index_[tensor_name] = 0;
+  }
+}
+
 /// @ingroup ge
 /// @brief Data Op Initialize.
 /// @param [in] NodePtr: Data Op.
@@ -3948,15 +3963,4 @@ int64_t DavinciModel::GetFixedAddrsSize(string tensor_name) {
   }
 }
 
-void DavinciModel::GetFixedAddrAttr(const OpDescPtr &op_desc) {
-  if (op_desc->HasAttr(ATTR_DYNAMIC_SHAPE_FIXED_ADDR) && op_desc->HasAttr(ATTR_DYNAMIC_SHAPE_FIXED_ADDR_INDEX)) {
-    string tensor_name;
-    (void)AttrUtils::GetStr(op_desc, ATTR_DYNAMIC_SHAPE_FIXED_ADDR, tensor_name);
-    int64_t index = -1;
-    (void)AttrUtils::GetInt(op_desc, ATTR_DYNAMIC_SHAPE_FIXED_ADDR_INDEX, index);
-    if (index >= 0) {
-      tensor_name_to_peer_output_index_[tensor_name] = index;
-    }
-  }
-}
 }  // namespace ge
diff --git a/ge/graph/load/new_model_manager/davinci_model.h b/ge/graph/load/new_model_manager/davinci_model.h
index f41817bb..964057a4 100755
--- a/ge/graph/load/new_model_manager/davinci_model.h
+++ b/ge/graph/load/new_model_manager/davinci_model.h
@@ -838,7 +838,7 @@ class DavinciModel {
                              std::vector<ge::OutputTensorInfo> &outputs);
 
   void ParseAIPPInfo(std::string in_out_info, InputOutputDims &dims_info);
-  void GetFixedAddrAttr(const OpDescPtr &op_desc);
+  void SetLabelForDynamic(const NodePtr &node);
 
   bool is_model_has_inited_;
   uint32_t model_id_;
diff --git a/ge/graph/load/new_model_manager/task_info/label_switch_by_index_task_info.cc b/ge/graph/load/new_model_manager/task_info/label_switch_by_index_task_info.cc
index f26c19a6..ae7865a4 100644
--- a/ge/graph/load/new_model_manager/task_info/label_switch_by_index_task_info.cc
+++ b/ge/graph/load/new_model_manager/task_info/label_switch_by_index_task_info.cc
@@ -144,7 +144,7 @@ Status LabelSwitchByIndexTaskInfo::CalculateArgs(const domi::TaskDef &task_def,
     GELOGE(FAILED, "Label switch op only have one data input. Now input size is %zu", op_desc->GetInputsSize());
     return FAILED;
   }
-  string input_tensor_name = op_desc->GetInputNameByIndex(0);
+  string input_tensor_name = op_desc->GetName();
   fixed_addr_offset_ = davinci_model->GetFixedAddrsSize(input_tensor_name);
   auto tensor_desc = op_desc->GetInputDesc(0);
   int64_t tensor_size = 0;
diff --git a/ge/graph/load/new_model_manager/task_info/memcpy_async_task_info.cc b/ge/graph/load/new_model_manager/task_info/memcpy_async_task_info.cc
index 51e822e2..6eb53c8a 100755
--- a/ge/graph/load/new_model_manager/task_info/memcpy_async_task_info.cc
+++ b/ge/graph/load/new_model_manager/task_info/memcpy_async_task_info.cc
@@ -35,6 +35,7 @@ Status MemcpyAsyncTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *da
   kind_ = memcpy_async_.kind();
   dst_max_ = memcpy_async_.dst_max();
   OpDescPtr op_desc = davinci_model_->GetOpByIndex(memcpy_async_.op_index());
+  op_desc_ = op_desc;
   if (op_desc == nullptr) {
     GELOGE(INTERNAL_ERROR, "Task op index:%u out of range", memcpy_async_.op_index());
     return INTERNAL_ERROR;
@@ -45,7 +46,8 @@ Status MemcpyAsyncTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *da
     dst_ = reinterpret_cast<uint8_t *>(reinterpret_cast<uintptr_t>(src_) + sizeof(void *));
     // for zero copy
     kind_ = RT_MEMCPY_ADDR_DEVICE_TO_DEVICE;
-    GELOGI("MemcpyAsyncTaskInfo src_ %p, dst_ %p, args_offset %u.", src_, dst_, args_offset_);
+    GELOGI("MemcpyAsyncTaskInfo op name %s, src_ %p, dst_ %p, args_offset %u.",
+           op_desc->GetName().c_str(), src_, dst_, args_offset_);
     return SUCCESS;
   }
 
@@ -93,12 +95,30 @@ Status MemcpyAsyncTaskInfo::Distribute() {
 }
 
+/// @ingroup ge
+/// @brief Record this task's offset in the model's args area and, when the op carries a non-empty
+///        ATTR_DYNAMIC_SHAPE_FIXED_ADDR, its offset and tensor size in the fixed-address area.
+/// @param [in] task_def: task definition; memcpy_async().op_index() selects the op.
+/// @param [in] davinci_model: model queried for the op and updated with the sizes.
+/// @return SUCCESS
 Status MemcpyAsyncTaskInfo::CalculateArgs(const domi::TaskDef &task_def, DavinciModel *davinci_model) {
+  // Use the davinci_model argument here: the davinci_model_ member is only assigned a few lines
+  // below, so dereferencing it at this point could hit a null/stale pointer.
+  OpDescPtr op_desc = davinci_model->GetOpByIndex(task_def.memcpy_async().op_index());
   // the num of src and dst size is 2
   uint32_t args_size = sizeof(void *) * 2;
   args_offset_ = davinci_model->GetTotalArgsSize();
   davinci_model->SetTotalArgsSize(args_size);
   davinci_model_ = davinci_model;
   GELOGI("MemcpyAsyncTaskInfo kernel args_size %u, args_offset %u", args_size, args_offset_);
+  string peer_input_name;
+  if (AttrUtils::GetStr(op_desc, ATTR_DYNAMIC_SHAPE_FIXED_ADDR, peer_input_name) && !peer_input_name.empty()) {
+    uint32_t output_index = davinci_model->GetFixedAddrOutputIndex(peer_input_name);
+    fixed_addr_offset_ = davinci_model->GetFixedAddrsSize(peer_input_name);
+    auto tensor_desc = op_desc->GetOutputDesc(output_index);
+    int64_t tensor_size = 0;
+    GE_CHK_STATUS(TensorUtils::GetSize(tensor_desc, tensor_size));
+    davinci_model->SetTotalFixedAddrsSize(peer_input_name, tensor_size);
+  }
   return SUCCESS;
 }
 
@@ -117,8 +129,12 @@ Status MemcpyAsyncTaskInfo::UpdateArgs() {
 
   vector<void *> io_addrs;
   io_addrs.emplace_back(reinterpret_cast<void *>(src_));
-  io_addrs.emplace_back(reinterpret_cast<void *>(dst_));
-
+  if (op_desc_->HasAttr(ATTR_DYNAMIC_SHAPE_FIXED_ADDR)) {
+    void *fixed_addr = davinci_model_->GetCurrentFixedAddr(fixed_addr_offset_);
+    io_addrs.emplace_back(fixed_addr);
+  } else {
+    io_addrs.emplace_back(reinterpret_cast<void *>(dst_));
+  }
   davinci_model_->SetTotalIOAddrs(io_addrs);
 
   GELOGI("MemcpyAsyncTaskInfo::UpdateArgs success.");
diff --git a/ge/graph/load/new_model_manager/task_info/memcpy_async_task_info.h b/ge/graph/load/new_model_manager/task_info/memcpy_async_task_info.h
index 320e7fbc..7e74ab6f 100755
--- a/ge/graph/load/new_model_manager/task_info/memcpy_async_task_info.h
+++ b/ge/graph/load/new_model_manager/task_info/memcpy_async_task_info.h
@@ -44,6 +44,8 @@ class MemcpyAsyncTaskInfo : public TaskInfo {
   uint8_t *src_;
   uint64_t count_;
   uint32_t kind_;
+  OpDescPtr op_desc_;
+  int64_t fixed_addr_offset_ = 0;  // passed to GetCurrentFixedAddr(); zero-initialised so it is never read unset
   DavinciModel *davinci_model_ = nullptr;
   uint32_t args_offset_ = 0;
   domi::MemcpyAsyncDef memcpy_async_;
diff --git a/ge/graph/passes/subgraph_pass.cc b/ge/graph/passes/subgraph_pass.cc
index 04e28aaf..d8cc5676 100755
--- a/ge/graph/passes/subgraph_pass.cc
+++ b/ge/graph/passes/subgraph_pass.cc
@@ -149,7 +149,8 @@ Status SubgraphPass::SubgraphOutputNode(const ComputeGraphPtr &graph, const Node
     std::string op_type;
     bool insert_flag = NodeUtils::GetConstOpType(in_node, op_type) ||
                        IsAtomicRequired(in_node, peer_out_anchor->GetIdx()) || IsOutputContinuesRequired(in_node) ||
-                       ((in_node->GetType() == DATA) && (kWhileOpTypes.count(graph->GetParentNode()->GetType()) == 0));
+                       ((in_node->GetType() == DATA) && (kWhileOpTypes.count(graph->GetParentNode()->GetType()) == 0)) ||
+                         (NodeUtils::IsDynamicShape(node) && (kWhileOpTypes.count(in_node->GetType()) != 0));
     if (insert_flag) {
       GELOGD("Insert MemcpyAsync node between %s and %s.", in_node->GetName().c_str(), node->GetName().c_str());
       std::string name = node->GetName() + "_input_" + std::to_string(in_data_anchor->GetIdx()) + "_Memcpy";
@@ -212,6 +213,19 @@ Status SubgraphPass::WhileBodySubgraph(const ComputeGraphPtr &graph, const NodeP
     return SUCCESS;
   }
 
+  // insert identity between data and labelswitch in while cond subgraph
+  if (NodeUtils::IsDynamicShape(node)) {
+    ComputeGraphPtr while_cond = NodeUtils::GetSubgraph(*node, 0);
+    GE_CHECK_NOTNULL(while_cond);
+    std::vector<NodePtr> cond_data_nodes;
+    for (const auto &n : while_cond->GetDirectNode()) {
+      if (n->GetType() == DATA) {
+        cond_data_nodes.emplace_back(n);
+      }
+    }
+    GE_CHK_STATUS_RET(InsertInputMemcpy(while_cond, cond_data_nodes), "InsertInputMemcpy failed.");
+  }
+
   std::vector<NodePtr> data_nodes;
   std::set<uint32_t> bypass_index;
   NodePtr output_node = nullptr;
diff --git a/ge/single_op/single_op_model.cc b/ge/single_op/single_op_model.cc
index 98d56046..ea9df11d 100755
--- a/ge/single_op/single_op_model.cc
+++ b/ge/single_op/single_op_model.cc
@@ -31,7 +31,6 @@
 #include "task/aicpu_task_builder.h"
 #include "task/aicpu_kernel_task_builder.h"
 #include "task/tbe_task_builder.h"
-#include "graph/load/new_model_manager/model_manager.h"
 
 static std::atomic<std::uint64_t> aicpu_sessionid(0);
 
@@ -278,7 +277,6 @@ Status SingleOpModel::BuildTaskList(SingleOp &single_op) {
       GELOGD("Skip task type: %d", static_cast<int>(task_type));
     }
   }
-  GE_CHK_STATUS_RET(ModelManager::GetInstance()->LaunchCustAicpuSo(), "launch cust aicpu so failed.");
   return SUCCESS;
 }
 
@@ -450,8 +448,6 @@ Status SingleOpModel::BuildTaskListForDynamicOp(DynamicSingleOp &single_op) {
       GELOGD("Skip task type: %d", static_cast<int>(task_type));
     }
   }
-  GE_CHK_STATUS_RET(ModelManager::GetInstance()->LaunchCustAicpuSo(), "launch cust aicpu so failed.");
-
   return SUCCESS;
 }
 
diff --git a/ge/single_op/task/aicpu_kernel_task_builder.cc b/ge/single_op/task/aicpu_kernel_task_builder.cc
index b9c5b9d0..150c66e7 100755
--- a/ge/single_op/task/aicpu_kernel_task_builder.cc
+++ b/ge/single_op/task/aicpu_kernel_task_builder.cc
@@ -63,6 +63,7 @@ Status AiCpuCCTaskBuilder::BuildTask(AiCpuCCTask &task) {
     task.is_custom_ = true;
     task.dump_flag_ |= RT_KERNEL_CUSTOM_AICPU;
     GE_CHK_STATUS_RET(ModelManager::GetInstance()->LoadCustAicpuSo(op_desc_, so_name), "launch cust aicpu so failed");
+    GE_CHK_STATUS_RET(ModelManager::GetInstance()->LaunchCustAicpuSo(), "launch cust aicpu so failed.");
   }
 
   task.num_inputs_ = op_desc_->GetInputsSize();