!1188 dynamic shape over flow

From: @zhou_chao1993 Reviewed-by: @xchu42,@ji_chen Signed-off-by: @ji_chen
4 years ago · 1b845b9ac2
--- a/ge/CMakeLists.txt
+++ b/ge/CMakeLists.txt
@@ -103,6 +103,7 @@ set(TRAIN_SRC_LIST
    "common/profiling/profiling_manager.cc"
    "common/dump/dump_manager.cc"
    "common/dump/dump_properties.cc"
    "common/dump/opdebug_register.cc"
    "common/dump/dump_op.cc"
    "common/profiling/ge_profiling.cc"
    "common/profiling/ge_runner_profiling.cc"
@@ -427,6 +428,7 @@ set(INFER_SRC_LIST
    "common/dump/dump_properties.cc"
    "common/dump/dump_manager.cc"
    "common/dump/dump_op.cc"
    "common/dump/opdebug_register.cc"
    "common/dump/dump_server.cc"
    "common/helper/model_cache_helper.cc"
    "ge_local_engine/engine/host_cpu_engine.cc"
--- a/ge/common/dump/dump_manager.cc
+++ b/ge/common/dump/dump_manager.cc
@@ -104,8 +104,12 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status DumpManager::SetDumpConf
 // Returns the dump properties stored for `session_id`.
 //
 // The lookup is read-only: when no entry exists for the session, a shared default-constructed
 // DumpProperties instance is returned and dump_properties_map_ is left unchanged.
 // NOTE(review): the returned reference is used by callers after mutex_ has been released;
 // confirm that entries are not erased while such references are still in use.
 FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY const DumpProperties &DumpManager::GetDumpProperties(
  uint64_t session_id) {
  std::lock_guard<std::mutex> lock(mutex_);
  // Use find() rather than operator[] so that querying an unknown session does not insert an entry.
  const auto iter = dump_properties_map_.find(session_id);
  if (iter != dump_properties_map_.end()) {
    return iter->second;
  }
  // Unknown session: fall back to one shared default instance.
  static DumpProperties default_properties;
  return default_properties;
 }

 FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void DumpManager::AddDumpProperties(
--- a/ge/common/dump/dump_op.cc
+++ b/ge/common/dump/dump_op.cc
@@ -219,9 +219,9 @@ Status DumpOp::LaunchDumpOp() {
  op_mapping_info.set_dump_path(dump_path);
  op_mapping_info.set_flag(kAicpuLoadFlag);
  op_mapping_info.set_dump_step(dump_properties_.GetDumpStep());
  if (!dynamic_model_name_.empty()) {
  op_mapping_info.set_model_id(dynamic_model_id_);
  if (!dynamic_model_name_.empty() && dump_properties_.IsDumpOpen()) {
    op_mapping_info.set_model_name(dynamic_model_name_);
    op_mapping_info.set_model_id(dynamic_model_id_);
  }
  SetOpMappingLoopAddr(global_step_, loop_per_iter_, loop_cond_, op_mapping_info);
  GELOGI("Dump step is %s ,dump path is %s ,in Launch dump op", dump_properties_.GetDumpStep().c_str(),
@@ -253,7 +253,7 @@ Status DumpOp::LaunchDumpOp() {
    }
    op_mapping_info.mutable_task()->Add(std::move(task));
  }
  if (dump_properties_.GetDumpMode() == kDumpAll) {
  if (dump_properties_.GetDumpMode() == kDumpAll || dump_properties_.IsOpDebugOpen()) {
    auto ret = DumpOutput(task);
    if (ret != SUCCESS) {
      GELOGE(ret, "Dump output failed when in dumping all");
--- a/ge/common/dump/dump_properties.h
+++ b/ge/common/dump/dump_properties.h
@@ -81,11 +81,11 @@ class DumpProperties {

  const std::string &GetEnableDumpDebug() const {return enable_dump_debug_;}


 private:
  void CopyFrom(const DumpProperties &other);

  void SetDumpDebugOptions();

  std::string enable_dump_;
  std::string enable_dump_debug_;

--- a/ge/common/dump/opdebug_register.cc
+++ b/ge/common/dump/opdebug_register.cc
@@ -0,0 +1,148 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "opdebug_register.h"

 namespace {
 // Size in bytes of the device buffer allocated for op_debug_addr_.
 constexpr size_t kOpDebugMemorySize = 2048UL;
 // Size in bytes of the device slot (p2p_debug_addr_) that stores the address of the op-debug buffer.
 constexpr size_t kDebugP2pSize = 8UL;
 }  // namespace
 namespace ge {
 // Intentionally releases nothing: the device buffers are freed by the Unregister* methods.
 OpdebugRegister::~OpdebugRegister() = default;

 // Allocates the op-debug device buffers and registers them with the runtime for `model_handle`.
 //
 // On success the task id and stream id reported by rtDebugRegister, together with the device slot
 // that holds the debug buffer address, are passed to `data_dumper` through SaveOpDebugId.
 // On failure every buffer allocated so far is released again, so the caller does not need to call
 // UnregisterDebugForModel to avoid leaking device memory.
 //
 // Returns SUCCESS, the status of MallocMemForOpdebug, or the converted runtime error code.
 Status OpdebugRegister::RegisterDebugForModel(rtModel_t model_handle, uint32_t op_debug_mode, DataDumper &data_dumper) {
  GELOGD("Start to register debug for model in overflow");
  auto ret = MallocMemForOpdebug();
  if (ret != SUCCESS) {
    GELOGE(ret, "Malloc memory for opdebug in model overflow failed ,ret:0x%X", ret);
    // A partially completed allocation may have left a buffer behind. With a null handle,
    // UnregisterDebugForModel only frees the buffers and skips rtDebugUnRegister.
    UnregisterDebugForModel(nullptr);
    return ret;
  }
  uint32_t debug_stream_id = 0;
  uint32_t debug_task_id = 0;
  auto rt_ret = rtDebugRegister(model_handle, op_debug_mode, op_debug_addr_, &debug_stream_id, &debug_task_id);
  if (rt_ret != RT_ERROR_NONE) {
    GELOGE(RT_FAILED, "rtDebugRegister error, ret: 0x%X", rt_ret);
    // Nothing was registered, so only the freshly allocated buffers have to be released.
    UnregisterDebugForModel(nullptr);
    return RT_ERROR_TO_GE_STATUS(rt_ret);
  }
  GELOGD("debug_task_id:%u, debug_stream_id:%u in model overflow", debug_task_id, debug_stream_id);
  data_dumper.SaveOpDebugId(debug_task_id, debug_stream_id, p2p_debug_addr_, true);
  return SUCCESS;
 }

 // Undoes RegisterDebugForModel: unregisters op debug for `model_handle` (skipped when the handle is
 // null) and frees both device buffers. Failures are only logged as warnings; the buffer pointers
 // are reset to nullptr in any case.
 void OpdebugRegister::UnregisterDebugForModel(rtModel_t model_handle) {
  if (model_handle != nullptr) {
    GELOGD("start to call rtDebugUnRegister in model overflow.");
    const rtError_t unregister_ret = rtDebugUnRegister(model_handle);
    if (unregister_ret != RT_ERROR_NONE) {
      GELOGW("rtDebugUnRegister failed, ret: 0x%X", unregister_ret);
    }
  }

  // Release the debug buffer first, then the slot that stores its address.
  void **const buffers[] = {&op_debug_addr_, &p2p_debug_addr_};
  for (void **buffer : buffers) {
    if (*buffer == nullptr) {
      continue;
    }
    const rtError_t free_ret = rtFree(*buffer);
    if (free_ret != RT_ERROR_NONE) {
      GELOGW("rtFree failed, ret: 0x%X", free_ret);
    }
    *buffer = nullptr;
  }
 }

 // Allocates the op-debug device buffers and registers them with the runtime for `stream`.
 //
 // The runtime call is only compiled in when ONLY_COMPILE_OPEN_SRC is defined; otherwise the task id
 // and stream id handed to `data_dumper` stay 0.
 // On failure every buffer allocated so far is released again, so the caller does not need to call
 // UnregisterDebugForStream to avoid leaking device memory.
 //
 // Returns SUCCESS, the status of MallocMemForOpdebug, or the converted runtime error code.
 Status OpdebugRegister::RegisterDebugForStream(rtStream_t stream, uint32_t op_debug_mode, DataDumper &data_dumper) {
  GELOGD("Start to register debug for stream in stream overflow");
  auto ret = MallocMemForOpdebug();
  if (ret != SUCCESS) {
    GELOGE(ret, "Malloc memory for opdebug in stream overflow failed ,ret:0x%X", ret);
    // A partially completed allocation may have left a buffer behind. With a null stream,
    // UnregisterDebugForStream only frees the buffers and skips the runtime unregister call.
    UnregisterDebugForStream(nullptr);
    return ret;
  }

  uint32_t debug_stream_id = 0;
  uint32_t debug_task_id = 0;
 #ifdef ONLY_COMPILE_OPEN_SRC
  auto rt_ret = rtDebugRegisterForStream(stream, op_debug_mode, op_debug_addr_, &debug_stream_id, &debug_task_id);
  if (rt_ret != RT_ERROR_NONE) {
    GELOGE(RT_FAILED, "rtDebugRegisterForStream error, ret: 0x%X", rt_ret);
    // Nothing was registered, so only the freshly allocated buffers have to be released.
    UnregisterDebugForStream(nullptr);
    return RT_ERROR_TO_GE_STATUS(rt_ret);
  }
 #endif
  GELOGD("debug_task_id:%u, debug_stream_id:%u in stream overflow.", debug_task_id, debug_stream_id);
  data_dumper.SaveOpDebugId(debug_task_id, debug_stream_id, p2p_debug_addr_, true);
  return SUCCESS;
 }

 // Undoes RegisterDebugForStream: unregisters op debug for `stream` (only when ONLY_COMPILE_OPEN_SRC
 // is defined and the stream is not null) and frees both device buffers. Failures are only logged
 // as warnings; the buffer pointers are reset to nullptr in any case.
 void OpdebugRegister::UnregisterDebugForStream(rtStream_t stream) {
 #ifdef ONLY_COMPILE_OPEN_SRC
  if (stream != nullptr) {
    GELOGD("start call rtDebugUnRegisterForStream in unknown shape over flow.");
    const rtError_t unregister_ret = rtDebugUnRegisterForStream(stream);
    if (unregister_ret != RT_ERROR_NONE) {
      GELOGW("rtDebugUnRegisterForStream failed, ret: 0x%X", unregister_ret);
    }
  }
 #endif

  // Release the debug buffer first, then the slot that stores its address.
  void **const buffers[] = {&op_debug_addr_, &p2p_debug_addr_};
  for (void **buffer : buffers) {
    if (*buffer == nullptr) {
      continue;
    }
    const rtError_t free_ret = rtFree(*buffer);
    if (free_ret != RT_ERROR_NONE) {
      GELOGW("rtFree failed, ret: 0x%X", free_ret);
    }
    *buffer = nullptr;
  }
 }

 // Allocates the two device buffers used for overflow detection:
 //   op_debug_addr_  - kOpDebugMemorySize bytes (RT_MEMORY_DDR), the debug buffer itself;
 //   p2p_debug_addr_ - kDebugP2pSize bytes (RT_MEMORY_HBM), filled with the 64-bit address of
 //                     op_debug_addr_.
 //
 // If any step fails, the buffers allocated earlier in this call are freed and both members are
 // reset to nullptr, so a failed call leaves no device memory behind.
 //
 // Returns SUCCESS or the converted runtime error code of the step that failed.
 Status OpdebugRegister::MallocMemForOpdebug() {
  rtError_t rt_ret = rtMalloc(&op_debug_addr_, kOpDebugMemorySize, RT_MEMORY_DDR);
  if (rt_ret != RT_ERROR_NONE) {
    GELOGE(RT_FAILED, "rtMalloc error, ret: 0x%X", rt_ret);
    op_debug_addr_ = nullptr;
    return RT_ERROR_TO_GE_STATUS(rt_ret);
  }

  // For data dump, aicpu needs a pointer to the pointer that stores the real debug address.
  rt_ret = rtMalloc(&p2p_debug_addr_, kDebugP2pSize, RT_MEMORY_HBM);
  if (rt_ret != RT_ERROR_NONE) {
    GELOGE(RT_FAILED, "rtMalloc error, ret: 0x%X", rt_ret);
    p2p_debug_addr_ = nullptr;
  } else {
    const uint64_t debug_addrs_tmp = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(op_debug_addr_));
    rt_ret = rtMemcpy(p2p_debug_addr_, sizeof(uint64_t), &debug_addrs_tmp, sizeof(uint64_t), RT_MEMCPY_HOST_TO_DEVICE);
    if (rt_ret != RT_ERROR_NONE) {
      GELOGE(RT_FAILED, "rtMemcpy to p2p_addr error: 0x%X", rt_ret);
    }
  }
  if (rt_ret == RT_ERROR_NONE) {
    return SUCCESS;
  }

  // Failure after at least one successful allocation: give the device memory back.
  void **const buffers[] = {&p2p_debug_addr_, &op_debug_addr_};
  for (void **buffer : buffers) {
    if (*buffer == nullptr) {
      continue;
    }
    const rtError_t free_ret = rtFree(*buffer);
    if (free_ret != RT_ERROR_NONE) {
      GELOGW("rtFree failed, ret: 0x%X", free_ret);
    }
    *buffer = nullptr;
  }
  return RT_ERROR_TO_GE_STATUS(rt_ret);
 }

 }  // namespace ge
--- a/ge/common/dump/opdebug_register.h
+++ b/ge/common/dump/opdebug_register.h
@@ -0,0 +1,44 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef GE_COMMON_DUMP_OPDEBUG_REGISTER_H_
 #define GE_COMMON_DUMP_OPDEBUG_REGISTER_H_

 #include <map>
 #include "common/debug/ge_log.h"
 #include "common/debug/log.h"
 #include "graph/load/model_manager/data_dumper.h"

 namespace ge {
 // Owns the device buffers needed for overflow detection ("op debug") and wraps the runtime
 // register/unregister calls, either for a whole model or for a single stream.
 //
 // The destructor does not release anything: after a successful Register* call the matching
 // Unregister* call must be made to free the device buffers.
 class OpdebugRegister {
  public:
  OpdebugRegister() = default;
  ~OpdebugRegister();

  // The object holds raw device pointers that the Unregister* methods free. A copy would alias
  // those pointers and free them twice, so copying is disabled.
  OpdebugRegister(const OpdebugRegister &) = delete;
  OpdebugRegister &operator=(const OpdebugRegister &) = delete;

  // Allocates the debug buffers, registers them for `model_handle` with the given debug mode and
  // stores the resulting task/stream ids in `data_dumper`.
  Status RegisterDebugForModel(rtModel_t model_handle, uint32_t op_debug_mode, DataDumper &data_dumper);
  // Unregisters op debug for `model_handle` (skipped when null) and frees the debug buffers.
  void UnregisterDebugForModel(rtModel_t model_handle);

  // Allocates the debug buffers, registers them for `stream` with the given debug mode and
  // stores the resulting task/stream ids in `data_dumper`.
  Status RegisterDebugForStream(rtStream_t stream, uint32_t op_debug_mode, DataDumper &data_dumper);
  // Unregisters op debug for `stream` and frees the debug buffers.
  void UnregisterDebugForStream(rtStream_t stream);

  private:
  // Allocates op_debug_addr_ and p2p_debug_addr_ and writes the former's address into the latter.
  Status MallocMemForOpdebug();

  void *op_debug_addr_ = nullptr;   // device buffer passed to the runtime debug-register call
  void *p2p_debug_addr_ = nullptr;  // device slot holding the address of op_debug_addr_
 };
 }  // namespace ge
 #endif  // GE_COMMON_DUMP_OPDEBUG_REGISTER_H_
--- a/ge/executor/CMakeLists.txt
+++ b/ge/executor/CMakeLists.txt
@@ -17,6 +17,7 @@ set(SRC_LIST
    "../common/dump/dump_properties.cc"
    "../common/dump/dump_manager.cc"
    "../common/dump/dump_op.cc"
    "../common/dump/opdebug_register.cc"
    "../common/profiling/ge_profiling.cc"
    "../graph/load/graph_loader.cc"
    "../graph/execute/graph_execute.cc"
--- a/ge/graph/load/model_manager/data_dumper.h
+++ b/ge/graph/load/model_manager/data_dumper.h
@@ -36,21 +36,9 @@
 namespace ge {
 class DataDumper {
 public:
  explicit DataDumper(const RuntimeParam &rsh)
      : model_name_(),
        model_id_(0),
        runtime_param_(rsh),
        dev_mem_load_(nullptr),
        dev_mem_unload_(nullptr),
        op_list_(),
        input_map_(),
        load_flag_(false),
        device_id_(0),
        global_step_(0),
        loop_per_iter_(0),
        loop_cond_(0),
        compute_graph_(nullptr),
        ref_info_() {}
  DataDumper() : runtime_param_{} {}

  explicit DataDumper(const RuntimeParam &rsh) : runtime_param_(rsh) {}

  ~DataDumper();

@@ -105,10 +93,10 @@ class DataDumper {
  // for inference data dump
  std::string om_name_;

  uint32_t model_id_;
  uint32_t model_id_ = 0;
  const RuntimeParam &runtime_param_;
  void *dev_mem_load_;
  void *dev_mem_unload_;
  void *dev_mem_load_ = nullptr;
  void *dev_mem_unload_ = nullptr;

  struct InnerDumpInfo;
  struct InnerInputMapping;
@@ -119,16 +107,15 @@ class DataDumper {
  uint32_t end_graph_stream_id_ = 0;
  bool is_end_graph_ = false;
  std::multimap<std::string, InnerInputMapping> input_map_;  // release after DavinciModel::Init
  bool load_flag_;
  uint32_t device_id_;
  uintptr_t global_step_;
  uintptr_t loop_per_iter_;
  uintptr_t loop_cond_;
  ComputeGraphPtr compute_graph_;  // release after DavinciModel::Init
  std::map<OpDescPtr, void *> ref_info_;  // release after DavinciModel::Init
  bool load_flag_ = false;
  uint32_t device_id_ = 0;
  uintptr_t global_step_ = 0;
  uintptr_t loop_per_iter_ = 0;
  uintptr_t loop_cond_ = 0;
  ComputeGraphPtr compute_graph_ = nullptr;  // release after DavinciModel::Init
  std::map<OpDescPtr, void *> ref_info_;     // release after DavinciModel::Init
  void *l1_fusion_addr_ = nullptr;


  uint32_t op_debug_task_id_ = 0;
  uint32_t op_debug_stream_id_ = 0;
  void *op_debug_addr_ = nullptr;
@@ -144,20 +131,16 @@ class DataDumper {
  Status DumpOutputWithTask(const InnerDumpInfo &inner_dump_info, aicpu::dump::Task &task);
  Status DumpInput(const InnerDumpInfo &inner_dump_info, aicpu::dump::Task &task);
  Status DumpRefInput(const DataDumper::InnerDumpInfo &inner_dump_info, aicpu::dump::Input &input, size_t i,
                       const std::string &node_name_index);
                      const std::string &node_name_index);
  Status ExecuteLoadDumpInfo(aicpu::dump::OpMappingInfo &op_mapping_info);
  void SetEndGraphIdToAicpu(uint32_t task_id, uint32_t stream_id, aicpu::dump::OpMappingInfo &op_mapping_info);
  void SetOpDebugIdToAicpu(uint32_t task_id, uint32_t stream_id, void *op_debug_addr,
                           aicpu::dump::OpMappingInfo &op_mapping_info);
  Status ExecuteUnLoadDumpInfo(aicpu::dump::OpMappingInfo &op_mapping_info);
  Status GenerateInput(aicpu::dump::Input &input,
                       const OpDesc::Vistor<GeTensorDesc> &tensor_descs,
                       const uintptr_t &addr,
                       size_t index);
  Status GenerateOutput(aicpu::dump::Output &output,
                        const OpDesc::Vistor<GeTensorDesc> &tensor_descs,
                        const uintptr_t &addr,
                        size_t index);
  Status GenerateInput(aicpu::dump::Input &input, const OpDesc::Vistor<GeTensorDesc> &tensor_descs,
                       const uintptr_t &addr, size_t index);
  Status GenerateOutput(aicpu::dump::Output &output, const OpDesc::Vistor<GeTensorDesc> &tensor_descs,
                        const uintptr_t &addr, size_t index);
  void GenerateOpBuffer(const int64_t &size, aicpu::dump::Task &task);
 };
 struct DataDumper::InnerDumpInfo {
--- a/ge/graph/load/model_manager/davinci_model.cc
+++ b/ge/graph/load/model_manager/davinci_model.cc
@@ -232,6 +232,8 @@ DavinciModel::~DavinciModel() {

      FreeP2PMem();

      OpDebugUnRegister();

      if (l1_fusion_addr_ != nullptr) {
        GE_CHK_RT(rtFree(l1_fusion_addr_));
      }
@@ -242,8 +244,6 @@ DavinciModel::~DavinciModel() {
      }
    }

    OpDebugUnRegister();

    ReleaseTask();
    CleanTbeHandle();

@@ -568,77 +568,21 @@ Status DavinciModel::SetTSDevice() {
 }

 Status DavinciModel::OpDebugRegister() {
  bool is_op_debug = false;
  (void)ge::AttrUtils::GetBool(ge_model_, ATTR_OP_DEBUG_FLAG, is_op_debug);
  GELOGD("The value of op debug in ge_model is %d.", is_op_debug);
  if (is_op_debug) {
    debug_reg_mutex_.lock();
    rtError_t rt_ret = rtMalloc(&op_debug_addr_, kOpDebugMemorySize, RT_MEMORY_DDR);
    if (rt_ret != RT_ERROR_NONE) {
      GELOGE(RT_FAILED, "rtMalloc error, ret: 0x%X", rt_ret);
      return RT_ERROR_TO_GE_STATUS(rt_ret);
    }

    uint64_t debug_addrs_tmp = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(op_debug_addr_));

    // For data dump, aicpu needs the pointer to pointer that save the real debug address.
    rt_ret = rtMalloc(&p2p_debug_addr_, kDebugP2pSize, RT_MEMORY_HBM);
    if (rt_ret != RT_ERROR_NONE) {
      GELOGE(RT_FAILED, "rtMalloc error, ret: 0x%X", rt_ret);
      return RT_ERROR_TO_GE_STATUS(rt_ret);
    }
    rt_ret = rtMemcpy(p2p_debug_addr_, sizeof(uint64_t), &debug_addrs_tmp, sizeof(uint64_t), RT_MEMCPY_HOST_TO_DEVICE);
    if (rt_ret != RT_ERROR_NONE) {
      GELOGE(RT_FAILED, "rtMemcpy to p2p_addr error: 0x%X", rt_ret);
      return RT_ERROR_TO_GE_STATUS(rt_ret);
    }

    uint32_t op_debug_mode = 0;
    (void)ge::AttrUtils::GetInt(ge_model_, ATTR_OP_DEBUG_MODE, op_debug_mode);
    GELOGD("The value of op_debug_mode in ge_model_ is %u.", op_debug_mode);
    uint32_t debug_task_id = 0;
    uint32_t debug_stream_id = 0;
    rt_ret = rtDebugRegister(rt_model_handle_, op_debug_mode, op_debug_addr_, &debug_stream_id, &debug_task_id);
    if (rt_ret != RT_ERROR_NONE) {
      GELOGE(RT_FAILED, "rtDebugRegister error, ret: 0x%X", rt_ret);
      return RT_ERROR_TO_GE_STATUS(rt_ret);
  if (GetDumpProperties().IsOpDebugOpen()) {
     uint32_t op_debug_mode = GetDumpProperties().GetOpDebugMode();
    auto ret = opdebug_register_.RegisterDebugForModel(rt_model_handle_, op_debug_mode, data_dumper_);
    if (ret != SUCCESS) {
      GELOGE(ret,"Register known shape op debug failed, ret: 0x%X",ret);
      return ret;
    }
    GELOGI("debug_task_id:%d, debug_stream_id:%u", debug_task_id, debug_stream_id);
    is_op_debug_reg_ = true;

    data_dumper_.SaveOpDebugId(debug_task_id, debug_stream_id, p2p_debug_addr_, is_op_debug);
  }

  return SUCCESS;
 }

 void DavinciModel::OpDebugUnRegister() {
  if (is_op_debug_reg_) {
    debug_reg_mutex_.unlock();
    rtError_t rt_ret = RT_ERROR_NONE;
    if (rt_model_handle_ != nullptr) {
      GELOGD("start call debug_unregister.");
      rt_ret = rtDebugUnRegister(rt_model_handle_);
      if (rt_ret != RT_ERROR_NONE) {
        GELOGW("rtDebugUnRegister failed, ret: 0x%X", rt_ret);
      }
    }

    if (op_debug_addr_ != nullptr) {
      rt_ret = rtFree(op_debug_addr_);
      if (rt_ret != RT_ERROR_NONE) {
        GELOGW("rtFree failed, ret: 0x%X", rt_ret);
      }
      op_debug_addr_ = nullptr;
    }

    if (p2p_debug_addr_ != nullptr) {
      rt_ret = rtFree(p2p_debug_addr_);
      if (rt_ret != RT_ERROR_NONE) {
        GELOGW("rtFree failed, ret: 0x%X", rt_ret);
      }
      p2p_debug_addr_ = nullptr;
    }
    opdebug_register_.UnregisterDebugForModel(rt_model_handle_);
    is_op_debug_reg_ = false;
  }
  return;
--- a/ge/graph/load/model_manager/davinci_model.h
+++ b/ge/graph/load/model_manager/davinci_model.h
@@ -29,6 +29,7 @@
 #include "common/helper/om_file_helper.h"
 #include "common/opskernel/ge_task_info.h"
 #include "common/properties_manager.h"
 #include "common/dump/opdebug_register.h"
 #include "common/types.h"
 #include "framework/common/util.h"
 #include "graph/debug/ge_attr_define.h"
@@ -984,6 +985,7 @@ class DavinciModel {
  int64_t maxDumpOpNum_;
  // for data dump
  DataDumper data_dumper_;
  OpdebugRegister opdebug_register_;
  uint64_t iterator_count_;
  bool is_l1_fusion_enable_;
  map<OpDescPtr, void *> saved_task_addrs_;  // release after DavinciModel::Init
@@ -1021,8 +1023,6 @@ class DavinciModel {
  // for op debug
  mutex debug_reg_mutex_;
  bool is_op_debug_reg_ = false;
  void *op_debug_addr_ = nullptr;
  void *p2p_debug_addr_ = nullptr;
  bool is_online_infer_dynamic_ = false;
  bool is_getnext_sink_dynamic_ = false;
  vector<int32_t> cur_dynamic_dims_;
--- a/ge/hybrid/executor/hybrid_model_async_executor.cc
+++ b/ge/hybrid/executor/hybrid_model_async_executor.cc
@@ -85,6 +85,10 @@ Status HybridModelAsyncExecutor::Stop() {
    ret = future_.get();
  }

  if (is_op_debug_reg_) {
    op_debug_register_.UnregisterDebugForStream(stream_);
  }

  if (stream_ != nullptr) {
    GE_CHK_RT(rtStreamDestroy(stream_));
    stream_ = nullptr;
@@ -101,6 +105,7 @@ Status HybridModelAsyncExecutor::Init() {
  executor_ = std::unique_ptr<HybridModelExecutor>(new(std::nothrow) HybridModelExecutor(model_, device_id_, stream_));
  GE_CHECK_NOTNULL(executor_);
  GE_CHK_STATUS_RET(executor_->Init(), "Failed to init hybrid engine");
  GE_CHK_STATUS_RET(DumpOpDebug(),"Dump op debug failed in hybrid engine");

  GELOGI("HybridModel stage nums:%zu", model_->GetRootGraphItem()->NumGroups());
  if (model_->GetRootGraphItem()->NumGroups() >= kMinimumPiplineStages) {
@@ -508,5 +513,40 @@ Status HybridModelAsyncExecutor::Execute(const vector<GeTensor> &inputs, vector<

  return SUCCESS;
 }
 // Enables overflow detection (op debug) for the executor stream.
 //
 // Returns SUCCESS without doing anything when op debug is not enabled in the execution context's
 // dump properties. Otherwise it registers op debug for stream_, configures data_dumper_ with the
 // model name/id, device id and the addresses of the global-step / loop variables (nullptr when a
 // variable is absent), and finally loads the dump info.
 //
 // Returns the failing status if registration or LoadDumpInfo fails.
 Status HybridModelAsyncExecutor::DumpOpDebug() {
  const DumpProperties &dump_properties = executor_->GetContext()->dump_properties;
  if (!dump_properties.IsOpDebugOpen()) {
    return SUCCESS;
  }

  GELOGD("Opdebug is open in hybrid engine");
  const uint32_t op_debug_mode = dump_properties.GetOpDebugMode();
  // RegisterDebugForStream returns a ge Status, so it is checked with the status macro rather than
  // the runtime-error macro.
  GE_CHK_STATUS_RET(op_debug_register_.RegisterDebugForStream(stream_, op_debug_mode, data_dumper_),
                    "Register op debug for stream failed in hybrid engine");
  // From here on Stop() has to unregister, even if a later step fails.
  is_op_debug_reg_ = true;
  data_dumper_.SetDumpProperties(dump_properties);
  data_dumper_.SetModelName(model_->GetModelName());
  data_dumper_.SetModelId(model_->GetModelId());
  data_dumper_.SetDeviceId(model_->GetDeviceId());

  void *global_step = nullptr;
  TensorValue *variable_global_step = model_->GetVariable(NODE_NAME_GLOBAL_STEP);
  if (variable_global_step != nullptr) {
    global_step = const_cast<void *>(variable_global_step->GetData());
  }

  void *loop_per_iter = nullptr;
  TensorValue *variable_loop_per_iter = model_->GetVariable(NODE_NAME_FLOWCTRL_LOOP_PER_ITER);
  if (variable_loop_per_iter != nullptr) {
    loop_per_iter = const_cast<void *>(variable_loop_per_iter->GetData());
  }

  void *loop_cond = nullptr;
  TensorValue *variable_loop_cond = model_->GetVariable(NODE_NAME_FLOWCTRL_LOOP_COND);
  if (variable_loop_cond != nullptr) {
    loop_cond = const_cast<void *>(variable_loop_cond->GetData());
  }
  data_dumper_.SetLoopAddr(global_step, loop_per_iter, loop_cond);
  GE_CHK_STATUS_RET(data_dumper_.LoadDumpInfo(), "LoadDumpInfo failed in hybrid engine");
  GELOGD("Dump op debug SUCCESS in hybrid engine");
  return SUCCESS;
 }

 }  // namespace hybrid
 }  // namespace ge
--- a/ge/hybrid/executor/hybrid_model_async_executor.h
+++ b/ge/hybrid/executor/hybrid_model_async_executor.h
@@ -21,7 +21,9 @@
 #include <future>
 #include "external/ge/ge_api_error_codes.h"
 #include "external/ge/ge_api_types.h"
 #include "common/dump/opdebug_register.h"
 #include "graph/load/model_manager/data_inputer.h"
 #include "graph/load/model_manager/data_dumper.h"
 #include "hybrid/executor/hybrid_model_executor.h"
 #include "hybrid/executor/hybrid_model_pipeline_executor.h"
 #include "runtime/stream.h"
@@ -77,6 +79,8 @@ class HybridModelAsyncExecutor {

  Status PrepareInputs(const InputData &current_data, HybridModelExecutor::ExecuteArgs &args);

  Status DumpOpDebug();

  std::mutex mu_;
  HybridModel *model_;
  uint32_t device_id_ = 0U;
@@ -94,6 +98,9 @@ class HybridModelAsyncExecutor {
  std::vector<bool> is_input_dynamic_;
  std::shared_ptr<ModelListener> listener_;
  string om_name_;
  DataDumper data_dumper_;
  bool is_op_debug_reg_ = false;
  OpdebugRegister op_debug_register_;
 };
 }  // namespace hybrid
 }  // namespace ge
--- a/ge/hybrid/executor/worker/execution_engine.cc
+++ b/ge/hybrid/executor/worker/execution_engine.cc
@@ -266,9 +266,9 @@ Status NodeDoneCallback::OnNodeDone() {
  RECORD_CALLBACK_EVENT(graph_context_, context_->GetNodeName(), "[Compute] End");
  RECORD_CALLBACK_EVENT(graph_context_, context_->GetNodeName(), "[Callback] Start");

  auto dump_path = context_->GetDumpProperties().GetDumpPath();
  if (!dump_path.empty()) {
    GELOGI("Start to dump dynamic shape,dump_path is %s", dump_path.c_str());
  const DumpProperties &dump_properties = context_->GetDumpProperties();
  if (dump_properties.IsDumpOpen() || context_->IsOverFlow()) {
    GELOGI("Start to dump dynamic shape op");
    GE_CHK_STATUS_RET(DumpDynamicNode(), "Failed to dump dynamic node");
  }

--- a/ge/hybrid/model/hybrid_model.h
+++ b/ge/hybrid/model/hybrid_model.h
@@ -61,6 +61,10 @@ class HybridModel {
    device_id_ = device_id;
  }

  uint32_t GetDeviceId() {
    return device_id_;
  }

  void SetModelId(uint32_t model_id) {
    model_id_ = model_id;
  }
--- a/ge/hybrid/node_executor/aicore/aicore_node_executor.cc
+++ b/ge/hybrid/node_executor/aicore/aicore_node_executor.cc
@@ -17,6 +17,7 @@
 #include "aicore_node_executor.h"
 #include "framework/common/taskdown_common.h"
 #include "hybrid/executor/hybrid_execution_context.h"
 #include "external/runtime/rt_error_codes.h"

 namespace ge {
 namespace hybrid {
@@ -189,6 +190,7 @@ Status AiCoreNodeTask::ExecuteAsync(TaskContext &context, std::function<void()>
    }
    RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[AiCoreNodeLaunchKernel] Start");
    GE_CHK_STATUS_RET_NOLOG((*it)->LaunchKernel(context.GetStream()));
    GE_CHK_STATUS_RET_NOLOG(CheckOverflow(context));
    // save profiling data
    uint32_t task_id = 0;
    uint32_t stream_id = 0;
@@ -259,6 +261,25 @@ void AiCoreNodeTask::SetWorkspaceSizes(const vector<int64_t> &workspace_sizes) {
  workspace_sizes_ = workspace_sizes;
 }

 // Overflow check run after a kernel launch when op debug is enabled.
 //
 // With op debug disabled this only logs and returns SUCCESS. Otherwise the stream of `context` is
 // synchronized: an ACL_ERROR_RT_AICORE_OVER_FLOW result marks the context as overflowed and is
 // still reported as SUCCESS, any other runtime error is converted and returned.
 Status AiCoreNodeTask::CheckOverflow(TaskContext &context) {
  if (!context.GetDumpProperties().IsOpDebugOpen()) {
    GELOGD("Opdebug is not open in hybrid engine");
    return SUCCESS;
  }

  GELOGD("Op %s is doing overflow check in hybrid engine", context.GetNodeName());
  const auto sync_ret = rtStreamSynchronize(context.GetStream());
  if (sync_ret == ACL_ERROR_RT_AICORE_OVER_FLOW) {
    // Overflow is recorded on the context, not treated as an execution failure.
    context.SetOverFlow(true);
    GELOGW("Dynamic shape op %s is over flow", context.GetNodeName());
    return SUCCESS;
  }
  if (sync_ret != RT_ERROR_NONE) {
    GELOGE(sync_ret, "rtstreamsynchronize failed");
    return RT_ERROR_TO_GE_STATUS(sync_ret);
  }
  return SUCCESS;
 }

 TaskCompilerFactory &TaskCompilerFactory::GetInstance() {
  static TaskCompilerFactory instance;
  return instance;
--- a/ge/hybrid/node_executor/aicore/aicore_node_executor.h
+++ b/ge/hybrid/node_executor/aicore/aicore_node_executor.h
@@ -62,6 +62,7 @@ class AiCoreNodeTask : public NodeTask {
  const vector<int64_t> &GetWorkspaceSizes() const;
  void SetWorkspaceSizes(const vector<int64_t> &workspace_sizes);
 private:
  Status CheckOverflow(TaskContext &context);
  std::vector<std::unique_ptr<AiCoreOpTask>> tasks_;
  std::vector<int64_t> workspace_sizes_;
 };
--- a/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.cc
+++ b/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.cc
@@ -124,7 +124,7 @@ Status KnownNodeTask::Init(TaskContext &context) {
  }
  if (!load_flag_) {
    auto dump_properties = context.GetDumpProperties();
    if (dump_properties.IsDumpOpen()) {
    if (dump_properties.IsDumpOpen() || dump_properties.IsOpDebugOpen()) {
      davinci_model_->SetDumpProperties(dump_properties);
      void *global_step = nullptr;
      TensorValue *varible_global_step = context.GetVariable(NODE_NAME_GLOBAL_STEP);
--- a/ge/hybrid/node_executor/task_context.cc
+++ b/ge/hybrid/node_executor/task_context.cc
@@ -350,6 +350,14 @@ void TaskContext::SetStreamId(uint32_t stream_id) {
  stream_id_ = stream_id;
 }

 void TaskContext::SetOverFlow(bool is_over_flow) {
  is_over_flow_ = is_over_flow;
 }

 bool TaskContext::IsOverFlow() {
  return is_over_flow_;
 }

 Status TaskContext::AllocateWorkspace(size_t size, void **buffer, void *ori_addr) {
  GE_CHECK_NOTNULL(buffer);
  if (ori_addr == nullptr) {
--- a/ge/hybrid/node_executor/task_context.h
+++ b/ge/hybrid/node_executor/task_context.h
@@ -65,6 +65,7 @@ class TaskContext {
  int64_t GetSessionId() const;
  uint64_t GetIterationNumber() const;


  void NodeDone();
  void OnError(Status error);

@@ -106,6 +107,9 @@ class TaskContext {
  uint32_t GetStreamId() const;
  void SetStreamId(uint32_t stream_id);

  void SetOverFlow(bool is_over_flow);
  bool IsOverFlow();

  Status Synchronize();

  bool IsForceInferShape() const;
@@ -138,6 +142,7 @@ class TaskContext {
  uint32_t task_id_ = 0;
  uint32_t stream_id_ = 0;
  std::vector<TaskDescInfo> task_desc_info;
  bool is_over_flow_ = false;
 };
 }  // namespace hybrid
 }  // namespace ge
--- a/ge/single_op/task/op_task.cc
+++ b/ge/single_op/task/op_task.cc
@@ -491,21 +491,18 @@ Status AiCpuBaseTask::UpdateOutputShape(vector<GeTensorDesc> &output_desc) {
  }
  GELOGD("Start to update DEPEND_SHAPE_RANGE AiCpuBaseTask outputshape.");

  GE_CHK_RT_RET(rtMemcpy(aicpu_ext_handle_->GetExtInfo(),
                         aicpu_ext_handle_->GetExtInfoLen(),
                         ext_info_addr_dev_,
                         aicpu_ext_handle_->GetExtInfoLen(),
                         RT_MEMCPY_DEVICE_TO_HOST));
  GE_CHK_RT_RET(rtMemcpy(aicpu_ext_handle_->GetExtInfo(), aicpu_ext_handle_->GetExtInfoLen(), ext_info_addr_dev_,
                         aicpu_ext_handle_->GetExtInfoLen(), RT_MEMCPY_DEVICE_TO_HOST));

  for (size_t i = 0; i < num_outputs_; ++i) {
    GeShape shape;
    DataType data_type;
    aicpu_ext_handle_->GetOutputShapeAndType(i, shape, data_type);
    GE_CHK_STATUS_RET(UpdateShapeToOutputDesc(shape, output_desc[i]),
                      "AiCpuCCTask Update [%zu]th output shape failed.", i);
    GE_CHK_STATUS_RET(UpdateShapeToOutputDesc(shape, output_desc[i]), "AiCpuCCTask Update [%zu]th output shape failed.",
                      i);
    if (DumpManager::GetInstance().GetDumpProperties(kInferSessionId).IsSingleOpNeedDump()) {
      GE_CHK_STATUS_RET(op_desc_->UpdateOutputDesc(i, output_desc[i]),
                        "AiCpuCCTask Update [%zu]th output desc failed.", i);
      GE_CHK_STATUS_RET(op_desc_->UpdateOutputDesc(i, output_desc[i]), "AiCpuCCTask Update [%zu]th output desc failed.",
                        i);
    }
  }
  GELOGD("Update DEPEND_SHAPE_RANGE AiCpuBaseTask outputshape finished.");
@@ -697,10 +694,10 @@ Status AiCpuTask::UpdateShapeByHbmBuffer(vector<GeTensorDesc> &output_desc) {
      const auto &shape_hbm = out_shape_hbm_[i];

      uint32_t dim_num = result_summary.shape_data_size / sizeof(int64_t);
      std::unique_ptr<int64_t[]> shape_addr(new(std::nothrow) int64_t[dim_num]());
      std::unique_ptr<int64_t[]> shape_addr(new (std::nothrow) int64_t[dim_num]());
      GE_CHECK_NOTNULL(shape_addr);
      GE_CHK_RT_RET(rtMemcpy(shape_addr.get(), result_summary.shape_data_size,
                             shape_hbm, result_summary.shape_data_size, RT_MEMCPY_DEVICE_TO_HOST));
      GE_CHK_RT_RET(rtMemcpy(shape_addr.get(), result_summary.shape_data_size, shape_hbm,
                             result_summary.shape_data_size, RT_MEMCPY_DEVICE_TO_HOST));

      for (uint32_t dim_idx = 0; dim_idx < dim_num; ++dim_idx) {
        shape_dims.emplace_back(shape_addr[dim_idx]);
@@ -711,13 +708,14 @@ Status AiCpuTask::UpdateShapeByHbmBuffer(vector<GeTensorDesc> &output_desc) {
    GE_CHK_STATUS_RET(UpdateShapeToOutputDesc(GeShape(shape_dims), output_desc[i]),
                      "AiCpuTask update [%zu]th output shape failed.", i);
    if (DumpManager::GetInstance().GetDumpProperties(kInferSessionId).IsSingleOpNeedDump()) {
      GE_CHK_STATUS_RET(op_desc_->UpdateOutputDesc(i, output_desc[i]),
                        "AiCpuTask update [%zu]th output desc failed.", i);
      GE_CHK_STATUS_RET(op_desc_->UpdateOutputDesc(i, output_desc[i]), "AiCpuTask update [%zu]th output desc failed.",
                        i);
    }
  }
  return SUCCESS;
 }


 Status AiCpuTask::UpdateShapeAndDataByResultSummary(vector<GeTensorDesc> &output_desc,
                                                    vector<DataBuffer> &outputs,
                                                    rtStream_t stream) {
--- a/tests/depends/runtime/src/runtime_stub.cc
+++ b/tests/depends/runtime/src/runtime_stub.cc
@@ -431,3 +431,7 @@ rtError_t rtGetTaskIdAndStreamID(uint32_t *taskId, uint32_t *streamId)
 {
 return RT_ERROR_NONE;
 }

 // Test stub: ignores all arguments, leaves the output ids untouched and always reports success.
 rtError_t rtDebugRegisterForStream(rtStream_t stream, uint32_t flag, const void *addr, uint32_t *streamId,
                                    uint32_t *taskId)
 {
  return RT_ERROR_NONE;
 }
--- a/tests/ut/ge/CMakeLists.txt
+++ b/tests/ut/ge/CMakeLists.txt
@@ -162,6 +162,7 @@ set(COMMON_SRC_FILES
    "${GE_CODE_DIR}/ge/common/dump/dump_properties.cc"
    "${GE_CODE_DIR}/ge/common/helper/model_helper.cc"
    "${GE_CODE_DIR}/ge/common/dump/dump_manager.cc"
    "${GE_CODE_DIR}/ge/common/dump/opdebug_register.cc"
    "${GE_CODE_DIR}/ge/common/helper/om_file_helper.cc"
    "${GE_CODE_DIR}/ge/model/ge_root_model.cc"
    "${GE_CODE_DIR}/ge/common/model_parser/model_parser.cc"
@@ -734,6 +735,7 @@ set(MULTI_PARTS_TEST_FILES
    "graph/transop_util_unittest.cc"
    "common/datatype_transfer_unittest.cc"
    "common/dump_manager_unittest.cc"
    "common/opdebug_register_unittest.cc"
    "common/format_transfer_unittest.cc"
    "common/format_transfer_transpose_unittest.cc"
    "common/format_transfer_nchw_5d_unittest.cc"
--- a/tests/ut/ge/common/opdebug_register_unittest.cc
+++ b/tests/ut/ge/common/opdebug_register_unittest.cc
@@ -0,0 +1,51 @@
 /**
 * Copyright 2019-2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include <gtest/gtest.h>

 #include "common/dump/opdebug_register.h"
 #include "common/debug/log.h"
 #include "common/ge_inner_error_codes.h"

 namespace ge {
 class UTEST_opdebug_register : public testing::Test {
 protected:
  void SetUp() {}
  void TearDown() {}
 };
 
 // Registering op debug for a (dummy) model handle is expected to succeed; the buffers are
 // released again before the result is checked.
 TEST_F(UTEST_opdebug_register, register_debug_for_model_success) {
  OpdebugRegister debug_register;
  rtModel_t dummy_model = reinterpret_cast<void *>(0x111);
  const uint32_t debug_mode = 1;
  DataDumper dumper;

  const auto status = debug_register.RegisterDebugForModel(dummy_model, debug_mode, dumper);
  debug_register.UnregisterDebugForModel(dummy_model);

  EXPECT_EQ(status, ge::SUCCESS);
 }

 // Registering op debug for a (dummy) stream is expected to succeed; the buffers are released
 // again before the result is checked.
 TEST_F(UTEST_opdebug_register, register_debug_for_stream_success) {
  OpdebugRegister debug_register;
  rtStream_t dummy_stream = reinterpret_cast<void *>(0x111);
  const uint32_t debug_mode = 1;
  DataDumper dumper;

  const auto status = debug_register.RegisterDebugForStream(dummy_stream, debug_mode, dumper);
  debug_register.UnregisterDebugForStream(dummy_stream);

  EXPECT_EQ(status, ge::SUCCESS);
 }


 }  // namespace ge