From 10662d550ff5a712d04c503a662b2e9d0aa74363 Mon Sep 17 00:00:00 2001 From: zhou_chao1993 Date: Wed, 3 Mar 2021 11:05:35 +0800 Subject: [PATCH] dynamic shape over flow --- ge/CMakeLists.txt | 2 + ge/common/dump/dump_manager.cc | 8 +- ge/common/dump/dump_op.cc | 6 +- ge/common/dump/dump_properties.h | 2 +- ge/common/dump/opdebug_register.cc | 148 ++++++++++++++++++ ge/common/dump/opdebug_register.h | 44 ++++++ ge/executor/CMakeLists.txt | 1 + ge/graph/load/model_manager/data_dumper.h | 53 +++---- ge/graph/load/model_manager/davinci_model.cc | 74 ++------- ge/graph/load/model_manager/davinci_model.h | 4 +- .../executor/hybrid_model_async_executor.cc | 40 +++++ .../executor/hybrid_model_async_executor.h | 7 + ge/hybrid/executor/worker/execution_engine.cc | 6 +- ge/hybrid/model/hybrid_model.h | 4 + .../aicore/aicore_node_executor.cc | 21 +++ .../aicore/aicore_node_executor.h | 1 + .../compiledsubgraph/known_node_executor.cc | 2 +- ge/hybrid/node_executor/task_context.cc | 8 + ge/hybrid/node_executor/task_context.h | 5 + ge/single_op/task/op_task.cc | 26 ++- tests/depends/runtime/src/runtime_stub.cc | 4 + tests/ut/ge/CMakeLists.txt | 2 + .../ut/ge/common/opdebug_register_unittest.cc | 51 ++++++ 23 files changed, 393 insertions(+), 126 deletions(-) create mode 100644 ge/common/dump/opdebug_register.cc create mode 100644 ge/common/dump/opdebug_register.h create mode 100644 tests/ut/ge/common/opdebug_register_unittest.cc diff --git a/ge/CMakeLists.txt b/ge/CMakeLists.txt index 93c88cbf..8977ad85 100755 --- a/ge/CMakeLists.txt +++ b/ge/CMakeLists.txt @@ -103,6 +103,7 @@ set(TRAIN_SRC_LIST "common/profiling/profiling_manager.cc" "common/dump/dump_manager.cc" "common/dump/dump_properties.cc" + "common/dump/opdebug_register.cc" "common/dump/dump_op.cc" "common/profiling/ge_profiling.cc" "common/profiling/ge_runner_profiling.cc" @@ -427,6 +428,7 @@ set(INFER_SRC_LIST "common/dump/dump_properties.cc" "common/dump/dump_manager.cc" "common/dump/dump_op.cc" + "common/dump/opdebug_register.cc" "common/dump/dump_server.cc" "common/helper/model_cache_helper.cc" "ge_local_engine/engine/host_cpu_engine.cc" diff --git a/ge/common/dump/dump_manager.cc b/ge/common/dump/dump_manager.cc index 74324059..a659d9c6 100644 --- a/ge/common/dump/dump_manager.cc +++ b/ge/common/dump/dump_manager.cc @@ -104,8 +104,12 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status DumpManager::SetDumpConf FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY const DumpProperties &DumpManager::GetDumpProperties( uint64_t session_id) { std::lock_guard lock(mutex_); - // If session_id is not found in dump_properties_map_, operator[] will insert one. - return dump_properties_map_[session_id]; + auto iter = dump_properties_map_.find(session_id); + if (iter != dump_properties_map_.end()) { + return iter->second; + } + static DumpProperties default_properties; + return default_properties; } FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void DumpManager::AddDumpProperties( diff --git a/ge/common/dump/dump_op.cc b/ge/common/dump/dump_op.cc index 5c768e22..0becbdc8 100755 --- a/ge/common/dump/dump_op.cc +++ b/ge/common/dump/dump_op.cc @@ -219,9 +219,9 @@ Status DumpOp::LaunchDumpOp() { op_mapping_info.set_dump_path(dump_path); op_mapping_info.set_flag(kAicpuLoadFlag); op_mapping_info.set_dump_step(dump_properties_.GetDumpStep()); - if (!dynamic_model_name_.empty()) { + op_mapping_info.set_model_id(dynamic_model_id_); + if (!dynamic_model_name_.empty() && dump_properties_.IsDumpOpen()) { op_mapping_info.set_model_name(dynamic_model_name_); - op_mapping_info.set_model_id(dynamic_model_id_); } SetOpMappingLoopAddr(global_step_, loop_per_iter_, loop_cond_, op_mapping_info); GELOGI("Dump step is %s ,dump path is %s ,in Launch dump op", dump_properties_.GetDumpStep().c_str(), @@ -253,7 +253,7 @@ Status DumpOp::LaunchDumpOp() { } op_mapping_info.mutable_task()->Add(std::move(task)); } - if (dump_properties_.GetDumpMode() == kDumpAll) { + if (dump_properties_.GetDumpMode() == kDumpAll || dump_properties_.IsOpDebugOpen()) { auto ret = DumpOutput(task); if (ret != SUCCESS) { GELOGE(ret, "Dump output failed when in dumping all"); diff --git a/ge/common/dump/dump_properties.h b/ge/common/dump/dump_properties.h index 67f8c00e..8c064d58 100644 --- a/ge/common/dump/dump_properties.h +++ b/ge/common/dump/dump_properties.h @@ -81,11 +81,11 @@ class DumpProperties { const std::string &GetEnableDumpDebug() const {return enable_dump_debug_;} + private: void CopyFrom(const DumpProperties &other); void SetDumpDebugOptions(); - std::string enable_dump_; std::string enable_dump_debug_; diff --git a/ge/common/dump/opdebug_register.cc b/ge/common/dump/opdebug_register.cc new file mode 100644 index 00000000..340b89e5 --- /dev/null +++ b/ge/common/dump/opdebug_register.cc @@ -0,0 +1,148 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "opdebug_register.h" + +namespace { +const size_t kOpDebugMemorySize = 2048UL; +const size_t kDebugP2pSize = 8UL; +} // namespace +namespace ge { +OpdebugRegister::~OpdebugRegister() {} + +Status OpdebugRegister::RegisterDebugForModel(rtModel_t model_handle, uint32_t op_debug_mode, DataDumper &data_dumper) { + GELOGD("Start to register debug for model in overflow"); + auto ret = MallocMemForOpdebug(); + if (ret != SUCCESS) { + GELOGE(ret, "Malloc memory for opdebug in model overflow failed ,ret:0x%X", ret); + return ret; + } + uint32_t debug_stream_id = 0; + uint32_t debug_task_id = 0; + auto rt_ret = rtDebugRegister(model_handle, op_debug_mode, op_debug_addr_, &debug_stream_id, &debug_task_id); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "rtDebugRegister error, ret: 0x%X", rt_ret); + return RT_ERROR_TO_GE_STATUS(rt_ret); + } + GELOGD("debug_task_id:%u, debug_stream_id:%u in model overflow", debug_task_id, debug_stream_id); + data_dumper.SaveOpDebugId(debug_task_id, debug_stream_id, p2p_debug_addr_, true); + return SUCCESS; +} + +void OpdebugRegister::UnregisterDebugForModel(rtModel_t model_handle) { + rtError_t rt_ret = RT_ERROR_NONE; + if (model_handle != nullptr) { + GELOGD("start to call rtDebugUnRegister in model overflow."); + rt_ret = rtDebugUnRegister(model_handle); + if (rt_ret != RT_ERROR_NONE) { + GELOGW("rtDebugUnRegister failed, ret: 0x%X", rt_ret); + } + } + + if (op_debug_addr_ != nullptr) { + rt_ret = rtFree(op_debug_addr_); + if (rt_ret != RT_ERROR_NONE) { + GELOGW("rtFree failed, ret: 0x%X", rt_ret); + } + op_debug_addr_ = nullptr; + } + + if (p2p_debug_addr_ != nullptr) { + rt_ret = rtFree(p2p_debug_addr_); + if (rt_ret != RT_ERROR_NONE) { + GELOGW("rtFree failed, ret: 0x%X", rt_ret); + } + p2p_debug_addr_ = nullptr; + } + return; +} + +Status OpdebugRegister::RegisterDebugForStream(rtStream_t stream, uint32_t op_debug_mode, DataDumper &data_dumper) { + GELOGD("Start to register debug for stream in stream overflow"); + auto ret = MallocMemForOpdebug(); + if (ret != SUCCESS) { + GELOGE(ret, "Malloc memory for opdebug in stream overflow ,ret:0x%X", ret); + return ret; + } + + uint32_t debug_stream_id = 0; + uint32_t debug_task_id = 0; +#ifdef ONLY_COMPILE_OPEN_SRC + auto rt_ret = rtDebugRegisterForStream(stream, op_debug_mode, op_debug_addr_, &debug_stream_id, &debug_task_id); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "rtDebugRegisterForStream error, ret: 0x%X", rt_ret); + return RT_ERROR_TO_GE_STATUS(rt_ret); + } +#endif + GELOGD("debug_task_id:%u, debug_stream_id:%u in stream overflow.", debug_task_id, debug_stream_id); + data_dumper.SaveOpDebugId(debug_task_id, debug_stream_id, p2p_debug_addr_, true); + return SUCCESS; +} + +void OpdebugRegister::UnregisterDebugForStream(rtStream_t stream) { + rtError_t rt_ret = RT_ERROR_NONE; +#ifdef ONLY_COMPILE_OPEN_SRC + if (stream != nullptr) { + GELOGD("start call rtDebugUnRegisterForStream in unknown shape over flow."); + rt_ret = rtDebugUnRegisterForStream(stream); + if (rt_ret != RT_ERROR_NONE) { + GELOGW("rtDebugUnRegisterForStream failed, ret: 0x%X", rt_ret); + } + } +#endif + + if (op_debug_addr_ != nullptr) { + rt_ret = rtFree(op_debug_addr_); + if (rt_ret != RT_ERROR_NONE) { + GELOGW("rtFree failed, ret: 0x%X", rt_ret); + } + op_debug_addr_ = nullptr; + } + + if (p2p_debug_addr_ != nullptr) { + rt_ret = rtFree(p2p_debug_addr_); + if (rt_ret != RT_ERROR_NONE) { + GELOGW("rtFree failed, ret: 0x%X", rt_ret); + } + p2p_debug_addr_ = nullptr; + } + return; +} + +Status OpdebugRegister::MallocMemForOpdebug() { + rtError_t rt_ret = rtMalloc(&op_debug_addr_, kOpDebugMemorySize, RT_MEMORY_DDR); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "rtMalloc error, ret: 0x%X", rt_ret); + return RT_ERROR_TO_GE_STATUS(rt_ret); + } + + uint64_t debug_addrs_tmp = static_cast(reinterpret_cast(op_debug_addr_)); + // For data dump, aicpu needs the pointer to pointer that save the real debug address. + rt_ret = rtMalloc(&p2p_debug_addr_, kDebugP2pSize, RT_MEMORY_HBM); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "rtMalloc error, ret: 0x%X", rt_ret); + return RT_ERROR_TO_GE_STATUS(rt_ret); + } + rt_ret = rtMemcpy(p2p_debug_addr_, sizeof(uint64_t), &debug_addrs_tmp, sizeof(uint64_t), RT_MEMCPY_HOST_TO_DEVICE); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "rtMemcpy to p2p_addr error: 0x%X", rt_ret); + return RT_ERROR_TO_GE_STATUS(rt_ret); + } + + return SUCCESS; +} + +} // namespace ge \ No newline at end of file diff --git a/ge/common/dump/opdebug_register.h b/ge/common/dump/opdebug_register.h new file mode 100644 index 00000000..1826287d --- /dev/null +++ b/ge/common/dump/opdebug_register.h @@ -0,0 +1,44 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GE_COMMON_DUMP_OPDEBUG_REGISTER_H_ +#define GE_COMMON_DUMP_OPDEBUG_REGISTER_H_ + +#include +#include "common/debug/ge_log.h" +#include "common/debug/log.h" +#include "graph/load/model_manager/data_dumper.h" + +namespace ge { +class OpdebugRegister { + public: + OpdebugRegister() = default; + ~OpdebugRegister(); + + Status RegisterDebugForModel(rtModel_t model_handle, uint32_t op_debug_mode, DataDumper &data_dumper); + void UnregisterDebugForModel(rtModel_t model_handle); + + Status RegisterDebugForStream(rtStream_t stream, uint32_t op_debug_mode, DataDumper &data_dumper); + void UnregisterDebugForStream(rtStream_t stream); + + private: + Status MallocMemForOpdebug(); + + void *op_debug_addr_ = nullptr; + void *p2p_debug_addr_ = nullptr; +}; +} // namespace ge +#endif // GE_COMMON_DUMP_OPDEBUG_REGISTER_H_ diff --git a/ge/executor/CMakeLists.txt b/ge/executor/CMakeLists.txt index 31cbad7a..04654f99 100644 --- a/ge/executor/CMakeLists.txt +++ b/ge/executor/CMakeLists.txt @@ -17,6 +17,7 @@ set(SRC_LIST "../common/dump/dump_properties.cc" "../common/dump/dump_manager.cc" "../common/dump/dump_op.cc" + "../common/dump/opdebug_register.cc" "../common/profiling/ge_profiling.cc" "../graph/load/graph_loader.cc" "../graph/execute/graph_execute.cc" diff --git a/ge/graph/load/model_manager/data_dumper.h b/ge/graph/load/model_manager/data_dumper.h index 8e612688..fbe70cf0 100755 --- a/ge/graph/load/model_manager/data_dumper.h +++ b/ge/graph/load/model_manager/data_dumper.h @@ -36,21 +36,9 @@ namespace ge { class DataDumper { public: - explicit DataDumper(const RuntimeParam &rsh) - : model_name_(), - model_id_(0), - runtime_param_(rsh), - dev_mem_load_(nullptr), - dev_mem_unload_(nullptr), - op_list_(), - input_map_(), - load_flag_(false), - device_id_(0), - global_step_(0), - loop_per_iter_(0), - loop_cond_(0), - compute_graph_(nullptr), - ref_info_() {} + DataDumper() : runtime_param_{} {} + + explicit DataDumper(const RuntimeParam &rsh) : runtime_param_(rsh) {} ~DataDumper(); @@ -105,10 +93,10 @@ class DataDumper { // for inference data dump std::string om_name_; - uint32_t model_id_; + uint32_t model_id_ = 0; const RuntimeParam &runtime_param_; - void *dev_mem_load_; - void *dev_mem_unload_; + void *dev_mem_load_ = nullptr; + void *dev_mem_unload_ = nullptr; struct InnerDumpInfo; struct InnerInputMapping; @@ -119,16 +107,15 @@ class DataDumper { uint32_t end_graph_stream_id_ = 0; bool is_end_graph_ = false; std::multimap input_map_; // release after DavinciModel::Init - bool load_flag_; - uint32_t device_id_; - uintptr_t global_step_; - uintptr_t loop_per_iter_; - uintptr_t loop_cond_; - ComputeGraphPtr compute_graph_; // release after DavinciModel::Init - std::map ref_info_; // release after DavinciModel::Init + bool load_flag_ = false; + uint32_t device_id_ = 0; + uintptr_t global_step_ = 0; + uintptr_t loop_per_iter_ = 0; + uintptr_t loop_cond_ = 0; + ComputeGraphPtr compute_graph_ = nullptr; // release after DavinciModel::Init + std::map ref_info_; // release after DavinciModel::Init void *l1_fusion_addr_ = nullptr; - uint32_t op_debug_task_id_ = 0; uint32_t op_debug_stream_id_ = 0; void *op_debug_addr_ = nullptr; @@ -144,20 +131,16 @@ class DataDumper { Status DumpOutputWithTask(const InnerDumpInfo &inner_dump_info, aicpu::dump::Task &task); Status DumpInput(const InnerDumpInfo &inner_dump_info, aicpu::dump::Task &task); Status DumpRefInput(const DataDumper::InnerDumpInfo &inner_dump_info, aicpu::dump::Input &input, size_t i, - const std::string &node_name_index); + const std::string &node_name_index); Status ExecuteLoadDumpInfo(aicpu::dump::OpMappingInfo &op_mapping_info); void SetEndGraphIdToAicpu(uint32_t task_id, uint32_t stream_id, aicpu::dump::OpMappingInfo &op_mapping_info); void SetOpDebugIdToAicpu(uint32_t task_id, uint32_t stream_id, void *op_debug_addr, aicpu::dump::OpMappingInfo &op_mapping_info); Status ExecuteUnLoadDumpInfo(aicpu::dump::OpMappingInfo &op_mapping_info); - Status GenerateInput(aicpu::dump::Input &input, - const OpDesc::Vistor &tensor_descs, - const uintptr_t &addr, - size_t index); - Status GenerateOutput(aicpu::dump::Output &output, - const OpDesc::Vistor &tensor_descs, - const uintptr_t &addr, - size_t index); + Status GenerateInput(aicpu::dump::Input &input, const OpDesc::Vistor &tensor_descs, + const uintptr_t &addr, size_t index); + Status GenerateOutput(aicpu::dump::Output &output, const OpDesc::Vistor &tensor_descs, + const uintptr_t &addr, size_t index); void GenerateOpBuffer(const int64_t &size, aicpu::dump::Task &task); }; struct DataDumper::InnerDumpInfo { diff --git a/ge/graph/load/model_manager/davinci_model.cc b/ge/graph/load/model_manager/davinci_model.cc index b7bb97ce..c2ba4bf4 100755 --- a/ge/graph/load/model_manager/davinci_model.cc +++ b/ge/graph/load/model_manager/davinci_model.cc @@ -232,6 +232,8 @@ DavinciModel::~DavinciModel() { FreeP2PMem(); + OpDebugUnRegister(); + if (l1_fusion_addr_ != nullptr) { GE_CHK_RT(rtFree(l1_fusion_addr_)); } @@ -242,8 +244,6 @@ DavinciModel::~DavinciModel() { } } - OpDebugUnRegister(); - ReleaseTask(); CleanTbeHandle(); @@ -568,77 +568,21 @@ Status DavinciModel::SetTSDevice() { } Status DavinciModel::OpDebugRegister() { - bool is_op_debug = false; - (void)ge::AttrUtils::GetBool(ge_model_, ATTR_OP_DEBUG_FLAG, is_op_debug); - GELOGD("The value of op debug in ge_model is %d.", is_op_debug); - if (is_op_debug) { - debug_reg_mutex_.lock(); - rtError_t rt_ret = rtMalloc(&op_debug_addr_, kOpDebugMemorySize, RT_MEMORY_DDR); - if (rt_ret != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "rtMalloc error, ret: 0x%X", rt_ret); - return RT_ERROR_TO_GE_STATUS(rt_ret); - } - - uint64_t debug_addrs_tmp = static_cast(reinterpret_cast(op_debug_addr_)); - - // For data dump, aicpu needs the pointer to pointer that save the real debug address. - rt_ret = rtMalloc(&p2p_debug_addr_, kDebugP2pSize, RT_MEMORY_HBM); - if (rt_ret != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "rtMalloc error, ret: 0x%X", rt_ret); - return RT_ERROR_TO_GE_STATUS(rt_ret); - } - rt_ret = rtMemcpy(p2p_debug_addr_, sizeof(uint64_t), &debug_addrs_tmp, sizeof(uint64_t), RT_MEMCPY_HOST_TO_DEVICE); - if (rt_ret != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "rtMemcpy to p2p_addr error: 0x%X", rt_ret); - return RT_ERROR_TO_GE_STATUS(rt_ret); - } - - uint32_t op_debug_mode = 0; - (void)ge::AttrUtils::GetInt(ge_model_, ATTR_OP_DEBUG_MODE, op_debug_mode); - GELOGD("The value of op_debug_mode in ge_model_ is %u.", op_debug_mode); - uint32_t debug_task_id = 0; - uint32_t debug_stream_id = 0; - rt_ret = rtDebugRegister(rt_model_handle_, op_debug_mode, op_debug_addr_, &debug_stream_id, &debug_task_id); - if (rt_ret != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "rtDebugRegister error, ret: 0x%X", rt_ret); - return RT_ERROR_TO_GE_STATUS(rt_ret); + if (GetDumpProperties().IsOpDebugOpen()) { + uint32_t op_debug_mode = GetDumpProperties().GetOpDebugMode(); + auto ret = opdebug_register_.RegisterDebugForModel(rt_model_handle_, op_debug_mode, data_dumper_); + if (ret != SUCCESS) { + GELOGE(ret,"Register known shape op debug failed, ret: 0x%X",ret); + return ret; } - GELOGI("debug_task_id:%d, debug_stream_id:%u", debug_task_id, debug_stream_id); is_op_debug_reg_ = true; - - data_dumper_.SaveOpDebugId(debug_task_id, debug_stream_id, p2p_debug_addr_, is_op_debug); } - return SUCCESS; } void DavinciModel::OpDebugUnRegister() { if (is_op_debug_reg_) { - debug_reg_mutex_.unlock(); - rtError_t rt_ret = RT_ERROR_NONE; - if (rt_model_handle_ != nullptr) { - GELOGD("start call debug_unregister."); - rt_ret = rtDebugUnRegister(rt_model_handle_); - if (rt_ret != RT_ERROR_NONE) { - GELOGW("rtDebugUnRegister failed, ret: 0x%X", rt_ret); - } - } - - if (op_debug_addr_ != nullptr) { - rt_ret = rtFree(op_debug_addr_); - if (rt_ret != RT_ERROR_NONE) { - GELOGW("rtFree failed, ret: 0x%X", rt_ret); - } - op_debug_addr_ = nullptr; - } - - if (p2p_debug_addr_ != nullptr) { - rt_ret = rtFree(p2p_debug_addr_); - if (rt_ret != RT_ERROR_NONE) { - GELOGW("rtFree failed, ret: 0x%X", rt_ret); - } - p2p_debug_addr_ = nullptr; - } + opdebug_register_.UnregisterDebugForModel(rt_model_handle_); is_op_debug_reg_ = false; } return; diff --git a/ge/graph/load/model_manager/davinci_model.h b/ge/graph/load/model_manager/davinci_model.h index 4e29a4f4..70c0f687 100755 --- a/ge/graph/load/model_manager/davinci_model.h +++ b/ge/graph/load/model_manager/davinci_model.h @@ -29,6 +29,7 @@ #include "common/helper/om_file_helper.h" #include "common/opskernel/ge_task_info.h" #include "common/properties_manager.h" +#include "common/dump/opdebug_register.h" #include "common/types.h" #include "framework/common/util.h" #include "graph/debug/ge_attr_define.h" @@ -984,6 +985,7 @@ class DavinciModel { int64_t maxDumpOpNum_; // for data dump DataDumper data_dumper_; + OpdebugRegister opdebug_register_; uint64_t iterator_count_; bool is_l1_fusion_enable_; map saved_task_addrs_; // release after DavinciModel::Init @@ -1021,8 +1023,6 @@ class DavinciModel { // for op debug mutex debug_reg_mutex_; bool is_op_debug_reg_ = false; - void *op_debug_addr_ = nullptr; - void *p2p_debug_addr_ = nullptr; bool is_online_infer_dynamic_ = false; bool is_getnext_sink_dynamic_ = false; vector cur_dynamic_dims_; diff --git a/ge/hybrid/executor/hybrid_model_async_executor.cc b/ge/hybrid/executor/hybrid_model_async_executor.cc index 7d163130..b6c4dc9e 100644 --- a/ge/hybrid/executor/hybrid_model_async_executor.cc +++ b/ge/hybrid/executor/hybrid_model_async_executor.cc @@ -85,6 +85,10 @@ Status HybridModelAsyncExecutor::Stop() { ret = future_.get(); } + if (is_op_debug_reg_) { + op_debug_register_.UnregisterDebugForStream(stream_); + } + if (stream_ != nullptr) { GE_CHK_RT(rtStreamDestroy(stream_)); stream_ = nullptr; @@ -101,6 +105,7 @@ Status HybridModelAsyncExecutor::Init() { executor_ = std::unique_ptr(new(std::nothrow) HybridModelExecutor(model_, device_id_, stream_)); GE_CHECK_NOTNULL(executor_); GE_CHK_STATUS_RET(executor_->Init(), "Failed to init hybrid engine"); + GE_CHK_STATUS_RET(DumpOpDebug(),"Dump op debug failed in hybrid engine"); GELOGI("HybridModel stage nums:%zu", model_->GetRootGraphItem()->NumGroups()); if (model_->GetRootGraphItem()->NumGroups() >= kMinimumPiplineStages) { @@ -508,5 +513,40 @@ Status HybridModelAsyncExecutor::Execute(const vector &inputs, vector< return SUCCESS; } +Status HybridModelAsyncExecutor::DumpOpDebug() { + const DumpProperties &dump_properties = executor_->GetContext()->dump_properties; + if (dump_properties.IsOpDebugOpen()) { + GELOGD("Opdebug is open in hybrid engine"); + uint32_t op_debug_mode = dump_properties.GetOpDebugMode(); + GE_CHK_RT_RET(op_debug_register_.RegisterDebugForStream(stream_, op_debug_mode, data_dumper_)); + is_op_debug_reg_ = true; + data_dumper_.SetDumpProperties(dump_properties); + data_dumper_.SetModelName(model_->GetModelName()); + data_dumper_.SetModelId(model_->GetModelId()); + data_dumper_.SetDeviceId(model_->GetDeviceId()); + void *global_step = nullptr; + TensorValue *varible_global_step = model_->GetVariable(NODE_NAME_GLOBAL_STEP); + if (varible_global_step != nullptr) { + global_step = const_cast(varible_global_step->GetData()); + } + + void *loop_per_iter = nullptr; + TensorValue *varible_loop_per_iter = model_->GetVariable(NODE_NAME_FLOWCTRL_LOOP_PER_ITER); + if (varible_loop_per_iter != nullptr) { + loop_per_iter = const_cast(varible_loop_per_iter->GetData()); + } + + void *loop_cond = nullptr; + TensorValue *varible_loop_cond = model_->GetVariable(NODE_NAME_FLOWCTRL_LOOP_COND); + if (varible_loop_cond != nullptr) { + loop_cond = const_cast(varible_loop_cond->GetData()); + } + data_dumper_.SetLoopAddr(global_step, loop_per_iter, loop_cond); + GE_CHK_STATUS_RET(data_dumper_.LoadDumpInfo(), "LoadDumpInfo failed in hybrid engine"); + GELOGD("Dump op debug SUCCESS in hybrid engine"); + } + return SUCCESS; +} + } // namespace hybrid } // namespace ge diff --git a/ge/hybrid/executor/hybrid_model_async_executor.h b/ge/hybrid/executor/hybrid_model_async_executor.h index 4790248b..69d8a3f4 100644 --- a/ge/hybrid/executor/hybrid_model_async_executor.h +++ b/ge/hybrid/executor/hybrid_model_async_executor.h @@ -21,7 +21,9 @@ #include #include "external/ge/ge_api_error_codes.h" #include "external/ge/ge_api_types.h" +#include "common/dump/opdebug_register.h" #include "graph/load/model_manager/data_inputer.h" +#include "graph/load/model_manager/data_dumper.h" #include "hybrid/executor/hybrid_model_executor.h" #include "hybrid/executor/hybrid_model_pipeline_executor.h" #include "runtime/stream.h" @@ -77,6 +79,8 @@ class HybridModelAsyncExecutor { Status PrepareInputs(const InputData ¤t_data, HybridModelExecutor::ExecuteArgs &args); + Status DumpOpDebug(); + std::mutex mu_; HybridModel *model_; uint32_t device_id_ = 0U; @@ -94,6 +98,9 @@ class HybridModelAsyncExecutor { std::vector is_input_dynamic_; std::shared_ptr listener_; string om_name_; + DataDumper data_dumper_; + bool is_op_debug_reg_ = false; + OpdebugRegister op_debug_register_; }; } // namespace hybrid } // namespace ge diff --git a/ge/hybrid/executor/worker/execution_engine.cc b/ge/hybrid/executor/worker/execution_engine.cc index 63d9126b..673c82dd 100755 --- a/ge/hybrid/executor/worker/execution_engine.cc +++ b/ge/hybrid/executor/worker/execution_engine.cc @@ -266,9 +266,9 @@ Status NodeDoneCallback::OnNodeDone() { RECORD_CALLBACK_EVENT(graph_context_, context_->GetNodeName(), "[Compute] End"); RECORD_CALLBACK_EVENT(graph_context_, context_->GetNodeName(), "[Callback] Start"); - auto dump_path = context_->GetDumpProperties().GetDumpPath(); - if (!dump_path.empty()) { - GELOGI("Start to dump dynamic shape,dump_path is %s", dump_path.c_str()); + const DumpProperties &dump_properties = context_->GetDumpProperties(); + if (dump_properties.IsDumpOpen() || context_->IsOverFlow()) { + GELOGI("Start to dump dynamic shape op"); GE_CHK_STATUS_RET(DumpDynamicNode(), "Failed to dump dynamic node"); } diff --git a/ge/hybrid/model/hybrid_model.h b/ge/hybrid/model/hybrid_model.h index 500f0472..3e5bd635 100644 --- a/ge/hybrid/model/hybrid_model.h +++ b/ge/hybrid/model/hybrid_model.h @@ -61,6 +61,10 @@ class HybridModel { device_id_ = device_id; } + uint32_t GetDeviceId() { + return device_id_; + } + void SetModelId(uint32_t model_id) { model_id_ = model_id; } diff --git a/ge/hybrid/node_executor/aicore/aicore_node_executor.cc b/ge/hybrid/node_executor/aicore/aicore_node_executor.cc index 1640ad3b..119db0af 100755 --- a/ge/hybrid/node_executor/aicore/aicore_node_executor.cc +++ b/ge/hybrid/node_executor/aicore/aicore_node_executor.cc @@ -17,6 +17,7 @@ #include "aicore_node_executor.h" #include "framework/common/taskdown_common.h" #include "hybrid/executor/hybrid_execution_context.h" +#include "external/runtime/rt_error_codes.h" namespace ge { namespace hybrid { @@ -189,6 +190,7 @@ Status AiCoreNodeTask::ExecuteAsync(TaskContext &context, std::function } RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[AiCoreNodeLaunchKernel] Start"); GE_CHK_STATUS_RET_NOLOG((*it)->LaunchKernel(context.GetStream())); + GE_CHK_STATUS_RET_NOLOG(CheckOverflow(context)); // save profiling data uint32_t task_id = 0; uint32_t stream_id = 0; @@ -259,6 +261,25 @@ void AiCoreNodeTask::SetWorkspaceSizes(const vector &workspace_sizes) { workspace_sizes_ = workspace_sizes; } +Status AiCoreNodeTask::CheckOverflow(TaskContext &context) { + const DumpProperties &dump_properties = context.GetDumpProperties(); + if (dump_properties.IsOpDebugOpen()) { + GELOGD("Op %s is doing overflow check in hybrid engine", context.GetNodeName()); + auto rt_ret = rtStreamSynchronize(context.GetStream()); + if (rt_ret == ACL_ERROR_RT_AICORE_OVER_FLOW) { + context.SetOverFlow(true); + GELOGW("Dynamic shape op %s is over flow", context.GetNodeName()); + return SUCCESS; + } else if (rt_ret != RT_ERROR_NONE) { + GELOGE(rt_ret, "rtstreamsynchronize failed"); + return RT_ERROR_TO_GE_STATUS(rt_ret); + } + return SUCCESS; + } + GELOGD("Opdebug is not open in hybrid engine"); + return SUCCESS; +} + TaskCompilerFactory &TaskCompilerFactory::GetInstance() { static TaskCompilerFactory instance; return instance; diff --git a/ge/hybrid/node_executor/aicore/aicore_node_executor.h b/ge/hybrid/node_executor/aicore/aicore_node_executor.h index 2095b41d..c352764d 100755 --- a/ge/hybrid/node_executor/aicore/aicore_node_executor.h +++ b/ge/hybrid/node_executor/aicore/aicore_node_executor.h @@ -62,6 +62,7 @@ class AiCoreNodeTask : public NodeTask { const vector &GetWorkspaceSizes() const; void SetWorkspaceSizes(const vector &workspace_sizes); private: + Status CheckOverflow(TaskContext &context); std::vector> tasks_; std::vector workspace_sizes_; }; diff --git a/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.cc b/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.cc index 1d6e814b..cf5ac851 100755 --- a/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.cc +++ b/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.cc @@ -124,7 +124,7 @@ Status KnownNodeTask::Init(TaskContext &context) { } if (!load_flag_) { auto dump_properties = context.GetDumpProperties(); - if (dump_properties.IsDumpOpen()) { + if (dump_properties.IsDumpOpen() || dump_properties.IsOpDebugOpen()) { davinci_model_->SetDumpProperties(dump_properties); void *global_step = nullptr; TensorValue *varible_global_step = context.GetVariable(NODE_NAME_GLOBAL_STEP); diff --git a/ge/hybrid/node_executor/task_context.cc b/ge/hybrid/node_executor/task_context.cc index 08cce30c..84dd8fd8 100644 --- a/ge/hybrid/node_executor/task_context.cc +++ b/ge/hybrid/node_executor/task_context.cc @@ -350,6 +350,14 @@ void TaskContext::SetStreamId(uint32_t stream_id) { stream_id_ = stream_id; } +void TaskContext::SetOverFlow(bool is_over_flow) { + is_over_flow_ = is_over_flow; +} + +bool TaskContext::IsOverFlow() { + return is_over_flow_; +} + Status TaskContext::AllocateWorkspace(size_t size, void **buffer, void *ori_addr) { GE_CHECK_NOTNULL(buffer); if (ori_addr == nullptr) { diff --git a/ge/hybrid/node_executor/task_context.h b/ge/hybrid/node_executor/task_context.h index 645c1234..e00c5048 100644 --- a/ge/hybrid/node_executor/task_context.h +++ b/ge/hybrid/node_executor/task_context.h @@ -65,6 +65,7 @@ class TaskContext { int64_t GetSessionId() const; uint64_t GetIterationNumber() const; + void NodeDone(); void OnError(Status error); @@ -106,6 +107,9 @@ class TaskContext { uint32_t GetStreamId() const; void SetStreamId(uint32_t stream_id); + void SetOverFlow(bool is_over_flow); + bool IsOverFlow(); + Status Synchronize(); bool IsForceInferShape() const; @@ -138,6 +142,7 @@ class TaskContext { uint32_t task_id_ = 0; uint32_t stream_id_ = 0; std::vector task_desc_info; + bool is_over_flow_ = false; }; } // namespace hybrid } // namespace ge diff --git a/ge/single_op/task/op_task.cc b/ge/single_op/task/op_task.cc index 80c16968..f754af28 100755 --- a/ge/single_op/task/op_task.cc +++ b/ge/single_op/task/op_task.cc @@ -491,21 +491,18 @@ Status AiCpuBaseTask::UpdateOutputShape(vector &output_desc) { } GELOGD("Start to update DEPEND_SHAPE_RANGE AiCpuBaseTask outputshape."); - GE_CHK_RT_RET(rtMemcpy(aicpu_ext_handle_->GetExtInfo(), - aicpu_ext_handle_->GetExtInfoLen(), - ext_info_addr_dev_, - aicpu_ext_handle_->GetExtInfoLen(), - RT_MEMCPY_DEVICE_TO_HOST)); + GE_CHK_RT_RET(rtMemcpy(aicpu_ext_handle_->GetExtInfo(), aicpu_ext_handle_->GetExtInfoLen(), ext_info_addr_dev_, + aicpu_ext_handle_->GetExtInfoLen(), RT_MEMCPY_DEVICE_TO_HOST)); for (size_t i = 0; i < num_outputs_; ++i) { GeShape shape; DataType data_type; aicpu_ext_handle_->GetOutputShapeAndType(i, shape, data_type); - GE_CHK_STATUS_RET(UpdateShapeToOutputDesc(shape, output_desc[i]), - "AiCpuCCTask Update [%zu]th output shape failed.", i); + GE_CHK_STATUS_RET(UpdateShapeToOutputDesc(shape, output_desc[i]), "AiCpuCCTask Update [%zu]th output shape failed.", + i); if (DumpManager::GetInstance().GetDumpProperties(kInferSessionId).IsSingleOpNeedDump()) { - GE_CHK_STATUS_RET(op_desc_->UpdateOutputDesc(i, output_desc[i]), - "AiCpuCCTask Update [%zu]th output desc failed.", i); + GE_CHK_STATUS_RET(op_desc_->UpdateOutputDesc(i, output_desc[i]), "AiCpuCCTask Update [%zu]th output desc failed.", + i); } } GELOGD("Update DEPEND_SHAPE_RANGE AiCpuBaseTask outputshape finished."); @@ -697,10 +694,10 @@ Status AiCpuTask::UpdateShapeByHbmBuffer(vector &output_desc) { const auto &shape_hbm = out_shape_hbm_[i]; uint32_t dim_num = result_summary.shape_data_size / sizeof(int64_t); - std::unique_ptr shape_addr(new(std::nothrow) int64_t[dim_num]()); + std::unique_ptr shape_addr(new (std::nothrow) int64_t[dim_num]()); GE_CHECK_NOTNULL(shape_addr); - GE_CHK_RT_RET(rtMemcpy(shape_addr.get(), result_summary.shape_data_size, - shape_hbm, result_summary.shape_data_size, RT_MEMCPY_DEVICE_TO_HOST)); + GE_CHK_RT_RET(rtMemcpy(shape_addr.get(), result_summary.shape_data_size, shape_hbm, + result_summary.shape_data_size, RT_MEMCPY_DEVICE_TO_HOST)); for (uint32_t dim_idx = 0; dim_idx < dim_num; ++dim_idx) { shape_dims.emplace_back(shape_addr[dim_idx]); @@ -711,13 +708,14 @@ Status AiCpuTask::UpdateShapeByHbmBuffer(vector &output_desc) { GE_CHK_STATUS_RET(UpdateShapeToOutputDesc(GeShape(shape_dims), output_desc[i]), "AiCpuTask update [%zu]th output shape failed.", i); if (DumpManager::GetInstance().GetDumpProperties(kInferSessionId).IsSingleOpNeedDump()) { - GE_CHK_STATUS_RET(op_desc_->UpdateOutputDesc(i, output_desc[i]), - "AiCpuTask update [%zu]th output desc failed.", i); + GE_CHK_STATUS_RET(op_desc_->UpdateOutputDesc(i, output_desc[i]), "AiCpuTask update [%zu]th output desc failed.", + i); } } return SUCCESS; } + Status AiCpuTask::UpdateShapeAndDataByResultSummary(vector &output_desc, vector &outputs, rtStream_t stream) { diff --git a/tests/depends/runtime/src/runtime_stub.cc b/tests/depends/runtime/src/runtime_stub.cc index e6a7d66b..440b98e7 100644 --- a/tests/depends/runtime/src/runtime_stub.cc +++ b/tests/depends/runtime/src/runtime_stub.cc @@ -431,3 +431,7 @@ rtError_t rtGetTaskIdAndStreamID(uint32_t *taskId, uint32_t *streamId) { return RT_ERROR_NONE; } + +rtError_t rtDebugRegisterForStream(rtStream_t stream, uint32_t flag, const void *addr, uint32_t *streamId, uint32_t *taskId) { + return RT_ERROR_NONE; +} diff --git a/tests/ut/ge/CMakeLists.txt b/tests/ut/ge/CMakeLists.txt index 91b756cc..f87b09aa 100755 --- a/tests/ut/ge/CMakeLists.txt +++ b/tests/ut/ge/CMakeLists.txt @@ -162,6 +162,7 @@ set(COMMON_SRC_FILES "${GE_CODE_DIR}/ge/common/dump/dump_properties.cc" "${GE_CODE_DIR}/ge/common/helper/model_helper.cc" "${GE_CODE_DIR}/ge/common/dump/dump_manager.cc" + "${GE_CODE_DIR}/ge/common/dump/opdebug_register.cc" "${GE_CODE_DIR}/ge/common/helper/om_file_helper.cc" "${GE_CODE_DIR}/ge/model/ge_root_model.cc" "${GE_CODE_DIR}/ge/common/model_parser/model_parser.cc" @@ -733,6 +734,7 @@ set(MULTI_PARTS_TEST_FILES "graph/transop_util_unittest.cc" "common/datatype_transfer_unittest.cc" "common/dump_manager_unittest.cc" + "common/opdebug_register_unittest.cc" "common/format_transfer_unittest.cc" "common/format_transfer_transpose_unittest.cc" "common/format_transfer_nchw_5d_unittest.cc" diff --git a/tests/ut/ge/common/opdebug_register_unittest.cc b/tests/ut/ge/common/opdebug_register_unittest.cc new file mode 100644 index 00000000..fcdaddaf --- /dev/null +++ b/tests/ut/ge/common/opdebug_register_unittest.cc @@ -0,0 +1,51 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "common/dump/opdebug_register.h" +#include "common/debug/log.h" +#include "common/ge_inner_error_codes.h" + +namespace ge { +class UTEST_opdebug_register : public testing::Test { + protected: + void SetUp() {} + void TearDown() {} +}; + +TEST_F(UTEST_opdebug_register, register_debug_for_model_success) { + OpdebugRegister opdebug_register; + rtModel_t model_handle = (void*)0x111; + uint32_t op_debug_mode = 1; + DataDumper data_dumper; + auto ret = opdebug_register.RegisterDebugForModel(model_handle, op_debug_mode, data_dumper); + opdebug_register.UnregisterDebugForModel(model_handle); + EXPECT_EQ(ret, ge::SUCCESS); +} + +TEST_F(UTEST_opdebug_register, register_debug_for_stream_success) { + OpdebugRegister opdebug_register; + rtStream_t stream = (void*)0x111; + uint32_t op_debug_mode = 1; + DataDumper data_dumper; + auto ret = opdebug_register.RegisterDebugForStream(stream, op_debug_mode, data_dumper); + opdebug_register.UnregisterDebugForStream(stream); + EXPECT_EQ(ret, ge::SUCCESS); +} + + +} // namespace ge \ No newline at end of file