From 2328aa0ef4e3cd663ef631bcda27001a5791b211 Mon Sep 17 00:00:00 2001 From: zhengyuanhua Date: Fri, 9 Apr 2021 15:49:30 +0800 Subject: [PATCH] aic error --- ge/CMakeLists.txt | 2 + ge/common/debug/memory_dumper.cc | 2 +- ge/common/dump/exception_dumper.cc | 241 ++++++++++++++++++ ge/common/dump/exception_dumper.h | 48 ++++ ge/executor/CMakeLists.txt | 1 + ge/graph/load/model_manager/data_dumper.cc | 172 ------------- ge/graph/load/model_manager/data_dumper.h | 8 - ge/graph/load/model_manager/davinci_model.cc | 37 ++- ge/graph/load/model_manager/davinci_model.h | 11 +- ge/graph/load/model_manager/model_manager.cc | 18 +- ge/graph/load/model_manager/model_manager.h | 2 + .../task_info/kernel_ex_task_info.cc | 1 + .../task_info/kernel_task_info.cc | 1 + .../executor/hybrid_execution_context.cc | 22 ++ ge/hybrid/executor/hybrid_execution_context.h | 4 + .../executor/hybrid_model_async_executor.h | 2 + ge/hybrid/executor/hybrid_model_executor.cc | 13 +- .../hybrid_model_pipeline_executor.cc | 8 + ge/hybrid/executor/worker/execution_engine.cc | 42 +++ ge/hybrid/hybrid_davinci_model.cc | 28 ++ ge/hybrid/hybrid_davinci_model.h | 4 + ge/hybrid/hybrid_davinci_model_stub.cc | 8 + .../aicore/aicore_node_executor.cc | 2 + .../aicpu/aicpu_node_executor.cc | 2 + .../compiledsubgraph/known_node_executor.cc | 8 + .../compiledsubgraph/known_node_executor.h | 1 + parser | 2 +- tests/ut/ge/CMakeLists.txt | 2 + tests/ut/ge/common/dump_exception_unittest.cc | 54 ++++ .../ge/graph/load/davinci_model_unittest.cc | 12 + 30 files changed, 569 insertions(+), 189 deletions(-) create mode 100644 ge/common/dump/exception_dumper.cc create mode 100644 ge/common/dump/exception_dumper.h create mode 100644 tests/ut/ge/common/dump_exception_unittest.cc diff --git a/ge/CMakeLists.txt b/ge/CMakeLists.txt index d84bb89a..89745019 100755 --- a/ge/CMakeLists.txt +++ b/ge/CMakeLists.txt @@ -108,6 +108,7 @@ set(TRAIN_SRC_LIST "common/helper/model_cache_helper.cc" "common/profiling/profiling_manager.cc" "common/dump/dump_manager.cc" + "common/dump/exception_dumper.cc" "common/dump/dump_properties.cc" "common/dump/opdebug_register.cc" "common/dump/dump_op.cc" @@ -437,6 +438,7 @@ set(INFER_SRC_LIST "common/formats/formats.cc" "common/profiling/profiling_manager.cc" "common/dump/dump_properties.cc" + "common/dump/exception_dumper.cc" "common/dump/dump_manager.cc" "common/dump/dump_op.cc" "common/dump/opdebug_register.cc" diff --git a/ge/common/debug/memory_dumper.cc b/ge/common/debug/memory_dumper.cc index e19d9a95..668cf2ae 100644 --- a/ge/common/debug/memory_dumper.cc +++ b/ge/common/debug/memory_dumper.cc @@ -161,7 +161,7 @@ int MemoryDumper::OpenFile(const char *filename) { // Using the O_EXCL, if the file already exists,return failed to avoid privilege escalation vulnerability. mmMode_t mode = M_IRUSR | M_IWUSR; - int32_t fd = mmOpen2(real_path.c_str(), M_RDWR | M_CREAT | O_TRUNC, mode); + int32_t fd = mmOpen2(real_path.c_str(), M_RDWR | M_CREAT | M_APPEND, mode); if (fd == EN_ERROR || fd == EN_INVALID_PARAM) { GELOGE(kInvalidFd, "[Open][File]Failed. errno = %d, error:%s, filename:%s.", fd, strerror(errno), filename); diff --git a/ge/common/dump/exception_dumper.cc b/ge/common/dump/exception_dumper.cc new file mode 100644 index 00000000..bed389a7 --- /dev/null +++ b/ge/common/dump/exception_dumper.cc @@ -0,0 +1,241 @@ +/** + * Copyright 2019-2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "common/dump/exception_dumper.h" + +#include "common/ge/datatype_util.h" +#include "common/debug/memory_dumper.h" +#include "framework/common/debug/log.h" +#include "graph/manager/util/debug.h" +#include "graph/utils/tensor_utils.h" +#include "graph/load/model_manager/model_utils.h" +#include "proto/dump_task.pb.h" + +namespace { +static uint64_t GetNowTime() { + uint64_t ret = 0; + mmTimeval tv; + if (mmGetTimeOfDay(&tv, nullptr) == 0) { + ret = tv.tv_sec * 1000000ULL + tv.tv_usec; + } + + return ret; +} + +static void ReplaceStringElem(std::string &str) { + for_each(str.begin(), str.end(), [](char &ch) { + if ((ch == ' ') || (ch == '.') || (ch == '/') || (ch == '\\')) { + ch = '_'; + } + }); +} + +static void SetDumpData(const ge::OpDescInfo &op_desc_info, toolkit::dumpdata::DumpData &dump_data) { + dump_data.set_version("2.0"); + dump_data.set_dump_time(GetNowTime()); + dump_data.set_op_name(op_desc_info.op_name); + for (size_t i = 0; i < op_desc_info.input_format.size(); ++i) { + toolkit::dumpdata::OpInput input; + input.set_data_type(toolkit::dumpdata::OutputDataType( + ge::DataTypeUtil::GetIrDataType(op_desc_info.input_data_type[i]))); + input.set_format(toolkit::dumpdata::OutputFormat(op_desc_info.input_format[i])); + for (auto dim : op_desc_info.input_shape[i]) { + input.mutable_shape()->add_dim(dim); + } + input.set_size(op_desc_info.input_size[i]); + GELOGI("[Set][DumpData] The input size int exception is %ld", op_desc_info.input_size[i]); + dump_data.mutable_input()->Add(std::move(input)); + } + + for (size_t j = 0; j < op_desc_info.output_format.size(); ++j) { + toolkit::dumpdata::OpOutput output; + output.set_data_type(toolkit::dumpdata::OutputDataType( + ge::DataTypeUtil::GetIrDataType(op_desc_info.output_data_type[j]))); + output.set_format(toolkit::dumpdata::OutputFormat(op_desc_info.output_format[j])); + for (auto dim : op_desc_info.output_shape[j]) { + output.mutable_shape()->add_dim(dim); + } + output.set_size(op_desc_info.output_size[j]); + GELOGI("[Set][DumpData] The output size int exception is %ld", op_desc_info.output_size[j]); + dump_data.mutable_output()->Add(std::move(output)); + } +} +} // namespace + +namespace ge { +ExceptionDumper::~ExceptionDumper() {} + +void ExceptionDumper::SaveDumpOpInfo(const OpDescPtr &op, uint32_t task_id, uint32_t stream_id, + vector &input_addrs, vector &output_addrs) { + OpDescInfo op_desc_info; + SaveOpDescInfo(op, task_id, stream_id, op_desc_info); + op_desc_info.input_addrs = input_addrs; + op_desc_info.output_addrs = output_addrs; + op_desc_info_.emplace_back(std::move(op_desc_info)); +} + +void ExceptionDumper::SaveDumpOpInfo(const RuntimeParam &model_param, const OpDescPtr &op, + uint32_t task_id, uint32_t stream_id) { + OpDescInfo op_desc_info; + SaveOpDescInfo(op, task_id, stream_id, op_desc_info); + op_desc_info.input_addrs = ModelUtils::GetInputDataAddrs(model_param, op); + op_desc_info.output_addrs = ModelUtils::GetOutputDataAddrs(model_param, op); + op_desc_info_.emplace_back(std::move(op_desc_info)); +} + +void ExceptionDumper::SaveOpDescInfo(const OpDescPtr &op, uint32_t task_id, uint32_t stream_id, + OpDescInfo &op_desc_info) { + if (op == nullptr) { + GELOGW("[Save][OpExceptionInfo] op desc ptr is null."); + return; + } + GELOGD("[Save][OpExceptionInfo] Start to save dump op [%s] info of task_id: %u, stream_id: %u", + op->GetName().c_str(), task_id, stream_id); + op_desc_info.op_name = op->GetName(); + op_desc_info.op_type = op->GetType(); + op_desc_info.task_id = task_id; + op_desc_info.stream_id = stream_id; + for (size_t i = 0; i < op->GetAllInputsSize(); ++i) { + GeTensorDescPtr input_tensor_desc = op->MutableInputDesc(i); + if (input_tensor_desc == nullptr) { + continue; + } + op_desc_info.input_format.emplace_back(input_tensor_desc->GetFormat()); + op_desc_info.input_shape.emplace_back(input_tensor_desc->GetShape().GetDims()); + op_desc_info.input_data_type.emplace_back(input_tensor_desc->GetDataType()); + int64_t input_size = 0; + + if (TensorUtils::GetTensorSizeInBytes(*input_tensor_desc, input_size) != SUCCESS) { + GELOGW("[Save][OpExceptionInfo] Op [%s] get input size failed.", op->GetName().c_str()); + return; + } + GELOGD("[Save][OpExceptionInfo] Save dump op info, the input size is %ld", input_size); + op_desc_info.input_size.emplace_back(input_size); + } + for (size_t j = 0; j < op->GetOutputsSize(); ++j) { + GeTensorDescPtr output_tensor_desc = op->MutableOutputDesc(j); + if (output_tensor_desc == nullptr) { + continue; + } + op_desc_info.output_format.emplace_back(output_tensor_desc->GetFormat()); + op_desc_info.output_shape.emplace_back(output_tensor_desc->GetShape().GetDims()); + op_desc_info.output_data_type.emplace_back(output_tensor_desc->GetDataType()); + int64_t output_size = 0; + if (TensorUtils::GetTensorSizeInBytes(*output_tensor_desc, output_size) != SUCCESS) { + GELOGW("[Save][OpExceptionInfo] Op [%s] get output size failed.", op->GetName().c_str()); + return; + } + GELOGD("[Save][OpExceptionInfo] Save dump op info, the output size is %ld.", output_size); + op_desc_info.output_size.emplace_back(output_size); + } +} + +Status ExceptionDumper::DumpExceptionInfo(const std::vector &exception_infos) const { + GELOGI("[Dump][Exception] Start to dump exception info"); + for (const rtExceptionInfo &iter : exception_infos) { + OpDescInfo op_desc_info; + if (GetOpDescInfo(iter.streamid, iter.taskid, op_desc_info)) { + toolkit::dumpdata::DumpData dump_data; + SetDumpData(op_desc_info, dump_data); + uint64_t now_time = GetNowTime(); + std::string op_name = op_desc_info.op_name; + std::string op_type = op_desc_info.op_type; + ReplaceStringElem(op_name); + ReplaceStringElem(op_type); + string dump_file_path = + "./" + op_type + "." + op_name + "." + std::to_string(op_desc_info.task_id) + "." + std::to_string(now_time); + GELOGI("[Dump][Exception] The exception dump file path is %s", dump_file_path.c_str()); + + uint64_t proto_size = dump_data.ByteSizeLong(); + std::unique_ptr proto_msg(new (std::nothrow) char[proto_size]); + bool ret = dump_data.SerializeToArray(proto_msg.get(), proto_size); + if (!ret || proto_size == 0) { + REPORT_INNER_ERROR("E19999", "Serialize proto to string fail"); + GELOGE(PARAM_INVALID, "[Dump][Exception] Dump data proto serialize failed"); + return PARAM_INVALID; + } + + GE_CHK_STATUS_RET(MemoryDumper::DumpToFile(dump_file_path.c_str(), &proto_size, sizeof(uint64_t)), + "Failed to dump proto size"); + GE_CHK_STATUS_RET(MemoryDumper::DumpToFile(dump_file_path.c_str(), proto_msg.get(), proto_size), + "Failed to dump proto msg"); + if (DumpExceptionInput(op_desc_info, dump_file_path) != SUCCESS) { + GELOGE(PARAM_INVALID, "[Dump][Exception] Dump exception input failed"); + return PARAM_INVALID; + } + + if (DumpExceptionOutput(op_desc_info, dump_file_path) != SUCCESS) { + GELOGE(PARAM_INVALID, "[Dump][Exception] Dump exception output failed"); + return PARAM_INVALID; + } + GELOGI("[Dump][Exception] Dump exception info SUCCESS"); + } else { + GELOGE(PARAM_INVALID, "[Dump][Exception] Get op desc info failed,task id:%u,stream id:%u", + iter.taskid, iter.streamid); + return PARAM_INVALID; + } + } + return SUCCESS; +} + +bool ExceptionDumper::GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const { + GELOGI("[Get][OpDescInfo] There are %zu op need to dump.", op_desc_info_.size()); + for (size_t index = 0; index < op_desc_info_.size(); ++index) { + OpDescInfo dump_op_info = op_desc_info_.at(index); + if (dump_op_info.task_id == task_id && dump_op_info.stream_id == stream_id) { + GELOGI("[Get][OpDescInfo] Find exception op [%s] of task_id: %u, stream_id: %u.", + dump_op_info.op_name.c_str(), task_id, stream_id); + op_desc_info = dump_op_info; + return true; + } + } + return false; +} + +Status ExceptionDumper::DumpExceptionInput(const OpDescInfo &op_desc_info, const string &dump_file) const { + GELOGI("[Dump][ExceptionInput] Start to dump exception input"); + for (size_t i = 0; i < op_desc_info.input_addrs.size(); i++) { + if (Debug::DumpDevMem(dump_file.data(), op_desc_info.input_addrs.at(i), op_desc_info.input_size.at(i)) != SUCCESS) { + GELOGE(PARAM_INVALID, "[Dump][ExceptionInput] Dump the %zu input data of op [%s] failed", + i, op_desc_info.op_name.c_str()); + return PARAM_INVALID; + } + } + return SUCCESS; +} + +Status ExceptionDumper::DumpExceptionOutput(const OpDescInfo &op_desc_info, const string &dump_file) const { + GELOGI("[Dump][ExceptionOutput] Start to dump exception output"); + for (size_t i = 0; i < op_desc_info.output_addrs.size(); i++) { + if (Debug::DumpDevMem(dump_file.data(), op_desc_info.output_addrs.at(i), op_desc_info.output_size.at(i)) != + SUCCESS) { + GELOGE(PARAM_INVALID, "[Dump][ExceptionInput] Dump the %zu input data of op [%s] failed", + i, op_desc_info.op_name.c_str()); + return PARAM_INVALID; + } + } + return SUCCESS; +} + +OpDescInfo *ExceptionDumper::MutableOpDescInfo(uint32_t task_id, uint32_t stream_id) { + for (OpDescInfo &op_desc_info : op_desc_info_) { + if (op_desc_info.task_id == task_id && op_desc_info.stream_id == stream_id) { + return &op_desc_info; + } + } + return nullptr; +} +} // namespace ge \ No newline at end of file diff --git a/ge/common/dump/exception_dumper.h b/ge/common/dump/exception_dumper.h new file mode 100644 index 00000000..38a3f26e --- /dev/null +++ b/ge/common/dump/exception_dumper.h @@ -0,0 +1,48 @@ +/** + * Copyright 2019-2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GE_COMMON_DUMP_EXCEPTION_DUMPER_H_ +#define GE_COMMON_DUMP_EXCEPTION_DUMPER_H_ + +#include + +#include "graph/op_desc.h" +#include "framework/common/ge_types.h" +#include "graph/load/model_manager/task_info/task_info.h" + +namespace ge { +class ExceptionDumper { + public: + ExceptionDumper() = default; + ~ExceptionDumper(); + + void SaveDumpOpInfo(const OpDescPtr &op, uint32_t task_id, uint32_t stream_id, + std::vector &input_addrs, std::vector &output_addrs); + void SaveDumpOpInfo(const RuntimeParam &model_param, const OpDescPtr &op, uint32_t task_id, uint32_t stream_id); + Status DumpExceptionInfo(const std::vector &exception_infos) const; + bool GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const; + OpDescInfo *MutableOpDescInfo(uint32_t task_id, uint32_t stream_id); + + private: + void SaveOpDescInfo(const OpDescPtr &op, uint32_t task_id, uint32_t stream_id, OpDescInfo &op_desc_info); + Status DumpExceptionInput(const OpDescInfo &op_desc_info, const std::string &dump_file) const; + Status DumpExceptionOutput(const OpDescInfo &op_desc_info, const std::string &dump_file) const; + + std::vector op_desc_info_; +}; +} // namespace ge + +#endif // GE_COMMON_DUMP_EXCEPTION_DUMPER_H_ diff --git a/ge/executor/CMakeLists.txt b/ge/executor/CMakeLists.txt index 89fce8a0..2fca1aa6 100644 --- a/ge/executor/CMakeLists.txt +++ b/ge/executor/CMakeLists.txt @@ -16,6 +16,7 @@ set(SRC_LIST "../common/ge/plugin_manager.cc" "../common/ge/op_tiling_manager.cc" "../common/dump/dump_properties.cc" + "../common/dump/exception_dumper.cc" "../common/dump/dump_manager.cc" "../common/dump/dump_op.cc" "../common/dump/opdebug_register.cc" diff --git a/ge/graph/load/model_manager/data_dumper.cc b/ge/graph/load/model_manager/data_dumper.cc index 29b64268..f74272a5 100644 --- a/ge/graph/load/model_manager/data_dumper.cc +++ b/ge/graph/load/model_manager/data_dumper.cc @@ -72,24 +72,6 @@ static bool ParseNameIndex(const std::string &node_name_index, std::string &node static bool IsTensorDescWithSkipDumpAddrType(bool has_mem_type_attr, vector v_memory_type, size_t i) { return has_mem_type_attr && (v_memory_type[i] == RT_MEMORY_L1); } - -static uint64_t GetNowTime() { - uint64_t ret = 0; - mmTimeval tv; - if (mmGetTimeOfDay(&tv, nullptr) == 0) { - ret = tv.tv_sec * 1000000ULL + tv.tv_usec; - } - - return ret; -} - -static void ReplaceStringElem(std::string &str) { - for_each(str.begin(), str.end(), [](char &ch) { - if ((ch == ' ') || (ch == '.') || (ch == '/') || (ch == '\\')) { - ch = '_'; - } - }); -} } // namespace static int32_t GetIrDataType(ge::DataType data_type) { @@ -194,66 +176,6 @@ void DataDumper::SaveOpDebugId(uint32_t task_id, uint32_t stream_id, void *op_de is_op_debug_ = is_op_debug; } -void DataDumper::SaveDumpOpInfo(const RuntimeParam &model_param, const OpDescPtr &op, uint32_t task_id, - uint32_t stream_id) { - GELOGD("Start SaveDumpOpInfo of task_id: %u, stream_id: %u", task_id, stream_id); - OpDescInfo op_desc_info; - op_desc_info.op_name = op->GetName(); - op_desc_info.op_type = op->GetType(); - op_desc_info.task_id = task_id; - op_desc_info.stream_id = stream_id; - for (size_t i = 0; i < op->GetAllInputsSize(); ++i) { - GeTensorDescPtr input_tensor_desc = op->MutableInputDesc(i); - if (input_tensor_desc == nullptr) { - continue; - } - op_desc_info.input_format.emplace_back(input_tensor_desc->GetFormat()); - op_desc_info.input_shape.emplace_back(input_tensor_desc->GetShape().GetDims()); - op_desc_info.input_data_type.emplace_back(input_tensor_desc->GetDataType()); - int64_t input_size = 0; - - if (TensorUtils::GetTensorSizeInBytes(*input_tensor_desc, input_size) != SUCCESS) { - GELOGW("Get input size failed"); - return; - } - GELOGD("Save dump op info, the input size is %ld", input_size); - op_desc_info.input_size.emplace_back(input_size); - } - for (size_t j = 0; j < op->GetOutputsSize(); ++j) { - GeTensorDescPtr output_tensor_desc = op->MutableOutputDesc(j); - if (output_tensor_desc == nullptr) { - continue; - } - op_desc_info.output_format.emplace_back(output_tensor_desc->GetFormat()); - op_desc_info.output_shape.emplace_back(output_tensor_desc->GetShape().GetDims()); - op_desc_info.output_data_type.emplace_back(output_tensor_desc->GetDataType()); - int64_t output_size = 0; - if (TensorUtils::GetTensorSizeInBytes(*output_tensor_desc, output_size) != SUCCESS) { - GELOGW("Get input size failed"); - return; - } - GELOGD("Save dump op info, the output size is %ld", output_size); - op_desc_info.output_size.emplace_back(output_size); - } - op_desc_info.input_addrs = ModelUtils::GetInputDataAddrs(model_param, op); - op_desc_info.output_addrs = ModelUtils::GetOutputDataAddrs(model_param, op); - - op_desc_info_.emplace_back(op_desc_info); -} - -bool DataDumper::GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const { - GELOGI("There are %zu op need to dump.", op_desc_info_.size()); - for (size_t index = 0; index < op_desc_info_.size(); ++index) { - OpDescInfo dump_op_info = op_desc_info_.at(index); - if (dump_op_info.task_id == task_id && dump_op_info.stream_id == stream_id) { - GELOGI("find exception op of task_id: %u, stream_id: %u.", task_id, stream_id); - op_desc_info = dump_op_info; - return true; - } - } - return false; -} - void DataDumper::SaveDumpTask(uint32_t task_id, uint32_t stream_id, const std::shared_ptr &op_desc, uintptr_t args) { if (op_desc == nullptr) { @@ -904,98 +826,4 @@ void DataDumper::PrintCheckLog(string &dump_list_key) { } } } - -Status DataDumper::DumpExceptionInput(const OpDescInfo &op_desc_info, const string &dump_file) { - GELOGI("Start to dump exception input"); - for (size_t i = 0; i < op_desc_info.input_addrs.size(); i++) { - if (Debug::DumpDevMem(dump_file.data(), op_desc_info.input_addrs.at(i), op_desc_info.input_size.at(i)) != SUCCESS) { - GELOGE(PARAM_INVALID, "Dump the %zu input data failed", i); - return PARAM_INVALID; - } - } - return SUCCESS; -} - -Status DataDumper::DumpExceptionOutput(const OpDescInfo &op_desc_info, const string &dump_file) { - GELOGI("Start to dump exception output"); - for (size_t i = 0; i < op_desc_info.output_addrs.size(); i++) { - if (Debug::DumpDevMem(dump_file.data(), op_desc_info.output_addrs.at(i), op_desc_info.output_size.at(i)) != - SUCCESS) { - GELOGE(PARAM_INVALID, "Dump the %zu input data failed", i); - return PARAM_INVALID; - } - } - return SUCCESS; -} - -Status DataDumper::DumpExceptionInfo(const std::vector exception_infos) { - GELOGI("Start to dump exception info"); - for (const rtExceptionInfo &iter : exception_infos) { - OpDescInfo op_desc_info; - if (GetOpDescInfo(iter.streamid, iter.taskid, op_desc_info)) { - toolkit::dumpdata::DumpData dump_data; - dump_data.set_version("2.0"); - dump_data.set_dump_time(GetNowTime()); - dump_data.set_op_name(op_desc_info.op_name); - for (size_t i = 0; i < op_desc_info.input_format.size(); ++i) { - toolkit::dumpdata::OpInput input; - input.set_data_type(toolkit::dumpdata::OutputDataType(GetIrDataType(op_desc_info.input_data_type[i]))); - input.set_format(toolkit::dumpdata::OutputFormat(op_desc_info.input_format[i])); - for (auto dim : op_desc_info.input_shape[i]) { - input.mutable_shape()->add_dim(dim); - } - input.set_size(op_desc_info.input_size[i]); - GELOGI("The input size int exception is %ld", op_desc_info.input_size[i]); - dump_data.mutable_input()->Add(std::move(input)); - } - for (size_t j = 0; j < op_desc_info.output_format.size(); ++j) { - toolkit::dumpdata::OpOutput output; - output.set_data_type(toolkit::dumpdata::OutputDataType(GetIrDataType(op_desc_info.output_data_type[j]))); - output.set_format(toolkit::dumpdata::OutputFormat(op_desc_info.output_format[j])); - for (auto dim : op_desc_info.output_shape[j]) { - output.mutable_shape()->add_dim(dim); - } - output.set_size(op_desc_info.output_size[j]); - GELOGI("The output size int exception is %ld", op_desc_info.output_size[j]); - dump_data.mutable_output()->Add(std::move(output)); - } - uint64_t now_time = GetNowTime(); - std::string op_name = op_desc_info.op_name; - std::string op_type = op_desc_info.op_type; - ReplaceStringElem(op_name); - ReplaceStringElem(op_type); - string dump_file_path = - "./" + op_type + "." + op_name + "." + std::to_string(op_desc_info.task_id) + "." + std::to_string(now_time); - GELOGI("The exception dump file path is %s", dump_file_path.c_str()); - - uint64_t proto_size = dump_data.ByteSizeLong(); - std::unique_ptr proto_msg(new (std::nothrow) char[proto_size]); - bool ret = dump_data.SerializeToArray(proto_msg.get(), proto_size); - if (!ret || proto_size == 0) { - REPORT_INNER_ERROR("E19999", "Serialize proto to string fail"); - GELOGE(PARAM_INVALID, "Dump data proto serialize failed"); - return PARAM_INVALID; - } - - GE_CHK_STATUS_RET(MemoryDumper::DumpToFile(dump_file_path.c_str(), &proto_size, sizeof(uint64_t)), - "Failed to dump proto size"); - GE_CHK_STATUS_RET(MemoryDumper::DumpToFile(dump_file_path.c_str(), proto_msg.get(), proto_size), - "Failed to dump proto msg"); - if (DumpExceptionInput(op_desc_info, dump_file_path) != SUCCESS) { - GELOGE(PARAM_INVALID, "Dump exception input failed"); - return PARAM_INVALID; - } - - if (DumpExceptionOutput(op_desc_info, dump_file_path) != SUCCESS) { - GELOGE(PARAM_INVALID, "Dump exception output failed"); - return PARAM_INVALID; - } - GELOGI("Dump exception info SUCCESS"); - } else { - GELOGE(PARAM_INVALID, "Get op desc info failed,task id:%u,stream id:%u", iter.taskid, iter.streamid); - return PARAM_INVALID; - } - } - return SUCCESS; -} } // namespace ge diff --git a/ge/graph/load/model_manager/data_dumper.h b/ge/graph/load/model_manager/data_dumper.h index 06b42afd..8af07d86 100755 --- a/ge/graph/load/model_manager/data_dumper.h +++ b/ge/graph/load/model_manager/data_dumper.h @@ -70,8 +70,6 @@ class DataDumper { void SaveDumpInput(const std::shared_ptr &node); - void SaveDumpOpInfo(const RuntimeParam &model_param, const OpDescPtr &op, uint32_t task_id, uint32_t stream_id); - // args is device memory stored first output addr void SaveDumpTask(uint32_t task_id, uint32_t stream_id, const std::shared_ptr &op_desc, uintptr_t args); void SaveEndGraphId(uint32_t task_id, uint32_t stream_id); @@ -87,14 +85,8 @@ class DataDumper { void SetDumpProperties(const DumpProperties &dump_properties) { dump_properties_ = dump_properties; } const DumpProperties &GetDumpProperties() const { return dump_properties_; } - bool GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const; const std::vector &GetAllOpDescInfo() const { return op_desc_info_; } - // Dump exception info - Status DumpExceptionInput(const OpDescInfo &op_desc_info, const string &dump_file); - Status DumpExceptionOutput(const OpDescInfo &op_desc_info, const string &dump_file); - Status DumpExceptionInfo(const std::vector exception_infos); - private: void ReleaseDevMem(void **ptr) noexcept; diff --git a/ge/graph/load/model_manager/davinci_model.cc b/ge/graph/load/model_manager/davinci_model.cc index 2811d0a1..0d4b5b84 100755 --- a/ge/graph/load/model_manager/davinci_model.cc +++ b/ge/graph/load/model_manager/davinci_model.cc @@ -2656,9 +2656,9 @@ Status DavinciModel::ReturnResult(uint32_t data_id, const bool rslt_flg, const b GE_CHECK_NOTNULL(model_manager); auto exception_infos = model_manager->GetExceptionInfos(); if (exception_infos.size() > 0) { - GE_CHK_STATUS_RET(data_dumper_.DumpExceptionInfo(exception_infos), "Dump exception info failed"); + GE_CHK_STATUS_RET(DumpExceptionInfo(exception_infos), "[Dump][Exception] Dump exception info failed."); } else { - GELOGI("Exception info is null"); + GELOGI("[Dump][Exception] Exception info is null."); } GE_CHK_STATUS(listener_->OnComputeDone(model_id_, data_id, INTERNAL_ERROR, outputs), "OnComputeDone failed."); return INTERNAL_ERROR; @@ -4352,4 +4352,37 @@ Status DavinciModel::SetRunAsyncListenerCallback(const RunAsyncCallback &callbac listener->SetCallback(callback); return SUCCESS; } + +void DavinciModel::UpdateOpIOAddrs(uint32_t task_id, uint32_t stream_id, const std::vector &io_addrs) { + if (fixed_mem_base_ == reinterpret_cast(mem_base_)) { + GELOGD("[Update][OpIOAddrs] No need to update op input output addr."); + return; + } + + OpDescInfo *op_desc_info = exception_dumper_.MutableOpDescInfo(task_id, stream_id); + if (op_desc_info == nullptr) { + GELOGW("[Update][OpIOAddrs] Find op desc failed, task_id: %u, stream_id: %u.", task_id, stream_id); + return; + } + size_t input_size = op_desc_info->input_addrs.size(); + size_t output_size = op_desc_info->output_addrs.size(); + if (input_size + output_size != io_addrs.size()) { + GELOGW("[Update][OpIOAddrs] Op[%s] input size[%zu] and output size[%zu] is not equal to io addr size[%zu]", + op_desc_info->op_name.c_str(), input_size, output_size, io_addrs.size()); + return; + } + + vector input_addrs; + vector output_addrs; + for (size_t i = 0; i < io_addrs.size(); i++) { + if (i < input_size) { + input_addrs.emplace_back(GetRunAddress(io_addrs[i])); + } else { + output_addrs.emplace_back(GetRunAddress(io_addrs[i])); + } + } + op_desc_info->input_addrs = input_addrs; + op_desc_info->output_addrs = output_addrs; + GELOGD("[Update][OpIOAddrs] Op [%s] update input output addr success.", op_desc_info->op_name.c_str()); +} } // namespace ge diff --git a/ge/graph/load/model_manager/davinci_model.h b/ge/graph/load/model_manager/davinci_model.h index c28ed4d0..ac6169ad 100755 --- a/ge/graph/load/model_manager/davinci_model.h +++ b/ge/graph/load/model_manager/davinci_model.h @@ -29,6 +29,7 @@ #include "common/helper/om_file_helper.h" #include "common/opskernel/ge_task_info.h" #include "common/properties_manager.h" +#include "common/dump/exception_dumper.h" #include "common/dump/opdebug_register.h" #include "common/types.h" #include "framework/common/util.h" @@ -476,13 +477,17 @@ class DavinciModel { Status ReportProfilingData(); void SaveDumpOpInfo(const RuntimeParam &model_param, const OpDescPtr &op, uint32_t task_id, uint32_t stream_id) { - data_dumper_.SaveDumpOpInfo(model_param, op, task_id, stream_id); + exception_dumper_.SaveDumpOpInfo(model_param, op, task_id, stream_id); } void SaveDumpTask(uint32_t task_id, uint32_t stream_id, const shared_ptr &op_desc, uintptr_t args) { data_dumper_.SaveDumpTask(task_id, stream_id, op_desc, args); } + Status DumpExceptionInfo(const std::vector &exception_infos) const { + return exception_dumper_.DumpExceptionInfo(exception_infos); + } + void SetKnownShapeGlobalStep(void *global_step) { known_shape_global_step_ = global_step; } @@ -562,8 +567,9 @@ class DavinciModel { const DumpProperties &GetDumpProperties() const { return data_dumper_.GetDumpProperties(); } bool GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const { - return data_dumper_.GetOpDescInfo(stream_id, task_id, op_desc_info); + return exception_dumper_.GetOpDescInfo(stream_id, task_id, op_desc_info); } + void UpdateOpIOAddrs(uint32_t task_id, uint32_t stream_id, const std::vector &io_addrs); bool GetRunningFlag() const { return running_flg_; } void SetRunningFlag(bool flag) { running_flg_ = flag; } @@ -1012,6 +1018,7 @@ class DavinciModel { int64_t maxDumpOpNum_; // for data dump DataDumper data_dumper_; + ExceptionDumper exception_dumper_; OpdebugRegister opdebug_register_; uint64_t iterator_count_; bool is_l1_fusion_enable_; diff --git a/ge/graph/load/model_manager/model_manager.cc b/ge/graph/load/model_manager/model_manager.cc index df86291d..6114467c 100755 --- a/ge/graph/load/model_manager/model_manager.cc +++ b/ge/graph/load/model_manager/model_manager.cc @@ -280,6 +280,7 @@ ModelManager::~ModelManager() { model_map_.clear(); model_aicpu_kernel_.clear(); cust_aicpu_so_.clear(); + dump_exception_flag_ = false; GE_IF_BOOL_EXEC(device_count > 0, GE_CHK_RT(rtDeviceReset(0))); } @@ -1587,9 +1588,21 @@ Status ModelManager::GetOpDescInfo(uint32_t device_id, uint32_t stream_id, uint3 for (const auto &model : model_map_) { auto davinci_model = model.second; if (davinci_model->GetDeviceId() == device_id) { - GELOGI("Start to GetOpDescInfo of device_id: %u.", device_id); + GELOGI("[Get][OpDescInfo] Start to GetOpDescInfo of device_id: %u in davinci model.", device_id); if (davinci_model->GetOpDescInfo(stream_id, task_id, op_desc_info)) { - GELOGI("Find specific node of stream_id: %u, task_id: %u.", stream_id, task_id); + GELOGI("[Get][OpDescInfo] Find specific node of stream_id: %u, task_id: %u in davinci model.", + stream_id, task_id); + return SUCCESS; + } + } + } + for (const auto &model : hybrid_model_map_) { + auto hybrid_model = model.second; + if (hybrid_model->GetDeviceId() == device_id) { + GELOGI("[Get][OpDescInfo] Start to GetOpDescInfo of device_id: %u in hybrid model.", device_id); + if (hybrid_model->GetOpDescInfo(stream_id, task_id, op_desc_info)) { + GELOGI("[Get][OpDescInfo] Find specific node of stream_id: %u, task_id: %u in hybrid model.", + stream_id, task_id); return SUCCESS; } } @@ -1602,6 +1615,7 @@ Status ModelManager::EnableExceptionDump(const std::map &options if (iter != options.end()) { GELOGI("Find option enable_exeception_dump is %s", iter->second.c_str()); if (iter->second == "1") { + dump_exception_flag_ = true; rtError_t rt_ret = rtSetTaskFailCallback(reinterpret_cast(ExceptionCallback)); if (rt_ret != RT_ERROR_NONE) { REPORT_CALL_ERROR("E19999", "Call rtSetTaskFailCallback fail, ret = 0x%X", diff --git a/ge/graph/load/model_manager/model_manager.h b/ge/graph/load/model_manager/model_manager.h index 1d52696a..bf804d32 100755 --- a/ge/graph/load/model_manager/model_manager.h +++ b/ge/graph/load/model_manager/model_manager.h @@ -313,6 +313,7 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager { instance->AddExceptionInfo(*rt_exception_info); } + bool IsDumpExceptionOpen() { return dump_exception_flag_; } private: /// /// @ingroup domi_ome @@ -356,6 +357,7 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager { std::map> cust_aicpu_so_; static DumpProperties dump_properties_; + bool dump_exception_flag_ = false; }; } // namespace ge diff --git a/ge/graph/load/model_manager/task_info/kernel_ex_task_info.cc b/ge/graph/load/model_manager/task_info/kernel_ex_task_info.cc index de987d86..e2f600b3 100644 --- a/ge/graph/load/model_manager/task_info/kernel_ex_task_info.cc +++ b/ge/graph/load/model_manager/task_info/kernel_ex_task_info.cc @@ -357,6 +357,7 @@ void KernelExTaskInfo::SetIoAddrs(const OpDescPtr &op_desc) { Status KernelExTaskInfo::UpdateArgs() { GELOGI("KernelExTaskInfo::UpdateArgs in."); davinci_model_->SetTotalIOAddrs(io_addrs_); + davinci_model_->UpdateOpIOAddrs(task_id_, stream_id_, io_addrs_); GELOGI("KernelExTaskInfo::UpdateArgs success."); return SUCCESS; } diff --git a/ge/graph/load/model_manager/task_info/kernel_task_info.cc b/ge/graph/load/model_manager/task_info/kernel_task_info.cc index 4485515a..82c3e286 100755 --- a/ge/graph/load/model_manager/task_info/kernel_task_info.cc +++ b/ge/graph/load/model_manager/task_info/kernel_task_info.cc @@ -523,6 +523,7 @@ Status KernelTaskInfo::UpdateArgs() { return CopyNoncontinuousArgs(io_addr_offset_); } davinci_model_->SetTotalIOAddrs(io_addrs_); + davinci_model_->UpdateOpIOAddrs(task_id_, stream_id_, io_addrs_); } else if (kernel_type_ == ccKernelType::AI_CPU || kernel_type_ == ccKernelType::CUST_AI_CPU) { return CopyNoncontinuousArgs(sizeof(aicpu::AicpuParamHead)); } diff --git a/ge/hybrid/executor/hybrid_execution_context.cc b/ge/hybrid/executor/hybrid_execution_context.cc index bde30932..f1357285 100644 --- a/ge/hybrid/executor/hybrid_execution_context.cc +++ b/ge/hybrid/executor/hybrid_execution_context.cc @@ -63,5 +63,27 @@ Status GraphExecutionContext::Synchronize(rtStream_t rt_stream) { REPORT_CALL_ERROR("E19999", "invoke rtStreamSynchronize failed, ret = %d", rt_ret); return RT_FAILED; } + +Status GraphExecutionContext::DumpExceptionInfo(const std::vector &exception_infos) { + if (exception_infos.empty()) { + GELOGI("[Dump][ExceptionInfo] Exception info is null."); + return SUCCESS; + } + GELOGI("[Dump][ExceptionInfo] Start to search dynamic op info and to dump."); + if (exception_dumper.DumpExceptionInfo(exception_infos) != SUCCESS) { + GELOGE(FAILED, "[Dump][Exception] Dump dynamic op exception info failed."); + return FAILED; + } + GELOGI("[Dump][ExceptionInfo] Start to search static op info and to dump."); + for (const auto &iter : davinci_model) { + if (iter != nullptr) { + if (iter->DumpExceptionInfo(exception_infos) != SUCCESS) { + GELOGE(FAILED, "[Dump][ExceptionInfo] Dump static op exception info failed."); + return FAILED; + } + } + } + return SUCCESS; +} } // namespace hybrid } // namespace ge \ No newline at end of file diff --git a/ge/hybrid/executor/hybrid_execution_context.h b/ge/hybrid/executor/hybrid_execution_context.h index 54840c6a..67a96e98 100644 --- a/ge/hybrid/executor/hybrid_execution_context.h +++ b/ge/hybrid/executor/hybrid_execution_context.h @@ -23,6 +23,7 @@ #include "common/properties_manager.h" #include "framework/common/debug/ge_log.h" #include "graph/ge_local_context.h" +#include "graph/load/model_manager/davinci_model.h" #include "hybrid/common/npu_memory_allocator.h" #include "hybrid/common/tensor_value.h" #include "hybrid/executor/hybrid_profiler.h" @@ -54,6 +55,7 @@ struct GraphExecutionContext { void SetErrorCode(Status error_code); Status GetStatus() const; Status Synchronize(rtStream_t rt_stream); + Status DumpExceptionInfo(const std::vector &exception_infos); uint64_t session_id = 0; uint64_t context_id = 0; @@ -68,6 +70,8 @@ struct GraphExecutionContext { DumpProperties dump_properties; bool trace_enabled = false; bool dump_enabled = false; + ExceptionDumper exception_dumper; + std::vector> davinci_model; std::atomic_bool is_eos_{false}; long profiling_level = 0; long iteration = 0; diff --git a/ge/hybrid/executor/hybrid_model_async_executor.h b/ge/hybrid/executor/hybrid_model_async_executor.h index d3fd3d2a..c5a6533a 100644 --- a/ge/hybrid/executor/hybrid_model_async_executor.h +++ b/ge/hybrid/executor/hybrid_model_async_executor.h @@ -61,6 +61,8 @@ class HybridModelAsyncExecutor { void SetRunningFlag(bool flag) { running_flag_ = flag; } + const GraphExecutionContext * GeContext() { return executor_->GetContext(); } + private: Status InitInputDesc(); diff --git a/ge/hybrid/executor/hybrid_model_executor.cc b/ge/hybrid/executor/hybrid_model_executor.cc index 6addd9b5..ea4e6912 100755 --- a/ge/hybrid/executor/hybrid_model_executor.cc +++ b/ge/hybrid/executor/hybrid_model_executor.cc @@ -18,6 +18,7 @@ #include "graph/ge_context.h" #include "graph/runtime_inference_context.h" #include "graph/utils/tensor_utils.h" +#include "graph/load/model_manager/model_manager.h" #include "common/dump/dump_manager.h" #include "common/profiling/profiling_manager.h" @@ -102,7 +103,17 @@ Status HybridModelExecutor::ExecuteGraphInternal(SubgraphExecutor &executor, } if (!model_->IsSingleOp()) { - HYBRID_CHK_STATUS_RET(executor.Synchronize(), "Failed to sync root graph."); + Status ret = executor.Synchronize(); + if (ret != ge::SUCCESS) { + auto model_manager = ModelManager::GetInstance(); + GE_CHECK_NOTNULL(model_manager); + auto exception_infos = model_manager->GetExceptionInfos(); + if (!exception_infos.empty()) { + HYBRID_CHK_STATUS_RET(context_.DumpExceptionInfo(exception_infos), + "[Execute][GraphInternal] Dump exception info failed."); + } + GELOGE(ret, "[Execute][GraphInternal] Synchronize failed."); + } RECORD_MODEL_EXECUTION_EVENT(&context_, "[Synchronize] End"); } diff --git a/ge/hybrid/executor/hybrid_model_pipeline_executor.cc b/ge/hybrid/executor/hybrid_model_pipeline_executor.cc index a5de7c22..b2a77653 100644 --- a/ge/hybrid/executor/hybrid_model_pipeline_executor.cc +++ b/ge/hybrid/executor/hybrid_model_pipeline_executor.cc @@ -4,6 +4,7 @@ #include "common/dump/dump_manager.h" #include "graph/ge_context.h" #include "graph/runtime_inference_context.h" +#include "graph/load/model_manager/model_manager.h" namespace ge { namespace hybrid { @@ -266,6 +267,13 @@ Status HybridModelPipelineExecutor::Execute(HybridModelExecutor::ExecuteArgs &ar ret = stage_executors_[i]->Synchronize(); if (ret != SUCCESS) { + auto model_manager = ModelManager::GetInstance(); + GE_CHECK_NOTNULL(model_manager); + auto exception_infos = model_manager->GetExceptionInfos(); + if (!exception_infos.empty()) { + HYBRID_CHK_STATUS_RET(context_.DumpExceptionInfo(exception_infos), + "[Execute][GraphInternal] Dump exception info failed."); + } GELOGE(ret, "[Invoke][Synchronize] failed for [Executor: %zu].", i); REPORT_CALL_ERROR("E19999", "[Executor: %zu] failed to Synchronize result.", i); has_error = true; diff --git a/ge/hybrid/executor/worker/execution_engine.cc b/ge/hybrid/executor/worker/execution_engine.cc index 24713f96..dcb3f300 100755 --- a/ge/hybrid/executor/worker/execution_engine.cc +++ b/ge/hybrid/executor/worker/execution_engine.cc @@ -19,6 +19,7 @@ #include "graph/utils/tensor_utils.h" #include "graph/utils/tensor_adapter.h" #include "graph/debug/ge_attr_define.h" +#include "graph/load/model_manager/model_manager.h" #include "hybrid/node_executor/node_executor.h" #include "hybrid/executor//worker//shape_inference_engine.h" #include "common/dump/dump_op.h" @@ -70,6 +71,7 @@ class NodeDoneCallback { Status PrepareConstInputs(const NodeItem &node_item); Status DumpDynamicNode(); Status ProfilingReport(); + Status SaveDumpOpInfo(); Status GetTaskDescInfo(const NodePtr node, const HybridModel *model, std::vector &task_desc_info); GraphExecutionContext *graph_context_; @@ -266,6 +268,40 @@ Status NodeDoneCallback::DumpDynamicNode() { return SUCCESS; } +Status NodeDoneCallback::SaveDumpOpInfo() { + GE_CHECK_NOTNULL(graph_context_); + GE_CHECK_NOTNULL(graph_context_->model); + + auto node = context_->GetNodeItem().node; + if (node == nullptr) { + GELOGE(PARAM_INVALID, "[Save][DumpOpInfo] Get node is nullptr."); + return PARAM_INVALID; + } + auto op_desc = node->GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); + + vector input_addrs; + vector output_addrs; + for (int i = 0; i < context_->NumInputs(); i++) { + auto tensor_value = context_->GetInput(i); + GE_CHK_BOOL_RET_STATUS(tensor_value != nullptr, PARAM_INVALID, "[Save][DumpOpInfo] Tensor value is nullptr."); + void *input_addr = const_cast(tensor_value->GetData()); + input_addrs.emplace_back(input_addr); + } + for (int j = 0; j < context_->NumOutputs(); j++) { + auto tensor_value = context_->GetOutput(j); + GE_CHK_BOOL_RET_STATUS(tensor_value != nullptr, PARAM_INVALID, "[Save][DumpOpInfo] Tensor value is nullptr."); + void *output_addr = const_cast(tensor_value->GetData()); + output_addrs.emplace_back(output_addr); + } + + uint32_t stream_id = context_->GetStreamId(); + uint32_t task_id = context_->GetTaskId(); + graph_context_->exception_dumper.SaveDumpOpInfo(op_desc, task_id, stream_id, input_addrs, output_addrs); + + return SUCCESS; +} + Status NodeDoneCallback::OnNodeDone() { auto &node_item = context_->GetNodeItem(); GELOGI("[%s] Start callback process.", node_item.NodeName().c_str()); @@ -278,6 +314,12 @@ Status NodeDoneCallback::OnNodeDone() { GE_CHK_STATUS_RET(DumpDynamicNode(), "[Call][DumpDynamicNode] Failed."); } + auto model_manager = ModelManager::GetInstance(); + GE_CHECK_NOTNULL(model_manager); + if (model_manager->IsDumpExceptionOpen()) { + GE_CHK_STATUS_RET(SaveDumpOpInfo(), "[Save][DumpOpInfo] Failed to dump op info."); + } + if (ProfilingManager::Instance().ProfilingModelExecuteOn()) { GE_CHK_STATUS_RET(ProfilingReport(), "[Report][Profiling] of node[%s] failed.", node_item.NodeName().c_str()); } diff --git a/ge/hybrid/hybrid_davinci_model.cc b/ge/hybrid/hybrid_davinci_model.cc index 58432031..0ad1c865 100755 --- a/ge/hybrid/hybrid_davinci_model.cc +++ b/ge/hybrid/hybrid_davinci_model.cc @@ -82,6 +82,12 @@ class HybridDavinciModel::Impl { model_.SetOmName(model_name); } + uint32_t GetDeviceId() { + return model_.GetDeviceId(); + } + + const GraphExecutionContext * GeContext() { return executor_.GeContext(); } + uint64_t GetSessionId() { return model_.GetSessionId(); } @@ -199,6 +205,11 @@ void HybridDavinciModel::SetOmName(const string &om_name) { } } +uint32_t HybridDavinciModel::GetDeviceId() const { + GE_CHECK_NOTNULL(impl_); + return impl_->GetDeviceId(); +} + Status HybridDavinciModel::GetDynamicBatchInfo(std::vector> &batch_info, int32_t &dynamic_type) { GE_CHECK_NOTNULL(impl_); return impl_->GetDynamicBatchInfo(batch_info, dynamic_type); @@ -245,5 +256,22 @@ bool HybridDavinciModel::GetRunningFlag() const { return impl_->GetRunningFlag() Status HybridDavinciModel::SetRunAsyncListenerCallback(const RunAsyncCallback &callback) { return impl_->SetRunAsyncListenerCallback(callback); } + +bool HybridDavinciModel::GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const { + if (impl_ == nullptr) { + return false; + } + auto context = impl_->GeContext(); + GE_CHECK_NOTNULL(context); + bool ret = context->exception_dumper.GetOpDescInfo(stream_id, task_id, op_desc_info); + if (!ret) { + for (const auto &iter : context->davinci_model) { + if (iter->GetOpDescInfo(stream_id, task_id, op_desc_info)) { + return true; + } + } + } + return ret; +} } // namespace hybrid } // namespace ge diff --git a/ge/hybrid/hybrid_davinci_model.h b/ge/hybrid/hybrid_davinci_model.h index 449dd73e..472fff17 100644 --- a/ge/hybrid/hybrid_davinci_model.h +++ b/ge/hybrid/hybrid_davinci_model.h @@ -61,6 +61,8 @@ class HybridDavinciModel { uint64_t GetSessionId(); + uint32_t GetDeviceId() const; + Status GetDynamicBatchInfo(std::vector> &batch_info, int32_t &dynamic_type); void GetUserDesignateShapeOrder(std::vector &user_input_shape_order); @@ -80,6 +82,8 @@ class HybridDavinciModel { Status SetRunAsyncListenerCallback(const RunAsyncCallback &callback); + bool GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const; + private: HybridDavinciModel() = default; class Impl; diff --git a/ge/hybrid/hybrid_davinci_model_stub.cc b/ge/hybrid/hybrid_davinci_model_stub.cc index f30fe5cc..2d4fbe03 100644 --- a/ge/hybrid/hybrid_davinci_model_stub.cc +++ b/ge/hybrid/hybrid_davinci_model_stub.cc @@ -72,6 +72,10 @@ uint32_t HybridDavinciModel::GetDataInputerSize() { return 0; } +uint32_t HybridDavinciModel::GetDeviceId() const { + return 0; +} + Status HybridDavinciModel::GetDynamicBatchInfo(std::vector> &batch_info, int32_t &dynamic_type) { return UNSUPPORTED; } @@ -99,5 +103,9 @@ bool HybridDavinciModel::GetRunningFlag() const { Status HybridDavinciModel::SetRunAsyncListenerCallback(const RunAsyncCallback &callback) { return UNSUPPORTED; } + +bool HybridDavinciModel::GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const { + return true; +} } // namespace hybrid } // namespace ge \ No newline at end of file diff --git a/ge/hybrid/node_executor/aicore/aicore_node_executor.cc b/ge/hybrid/node_executor/aicore/aicore_node_executor.cc index a4fc4449..29ae831c 100755 --- a/ge/hybrid/node_executor/aicore/aicore_node_executor.cc +++ b/ge/hybrid/node_executor/aicore/aicore_node_executor.cc @@ -208,6 +208,8 @@ Status AiCoreNodeTask::ExecuteAsync(TaskContext &context, std::function REPORT_CALL_ERROR("E19999", "rtGetTaskIdAndStreamID failed, ret: 0x%X.", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } + context.SetTaskId(task_id); + context.SetStreamId(stream_id); GELOGD("Aicore node[%s] task_id: %u, stream_id: %u.", context.GetNodeName(), task_id, stream_id); (void)context.SaveProfilingTaskDescInfo(task_id, stream_id, kTaskTypeAicore, (*it)->GetBlockDim()); RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[AiCoreNodeLaunchKernel] End"); diff --git a/ge/hybrid/node_executor/aicpu/aicpu_node_executor.cc b/ge/hybrid/node_executor/aicpu/aicpu_node_executor.cc index 339e1ee4..c2ebf654 100755 --- a/ge/hybrid/node_executor/aicpu/aicpu_node_executor.cc +++ b/ge/hybrid/node_executor/aicpu/aicpu_node_executor.cc @@ -208,6 +208,8 @@ Status AicpuNodeTaskBase::ExecuteAsync(TaskContext &context, std::functionSubModelId()), "[Destroy][AicpuKernel] failed, session_id:%lu, model_id:%u, sub_model_id:%u", davinci_model_->GetSessionId(), davinci_model_->Id(), davinci_model_->SubModelId()); + if (!load_flag_) { + auto execution_context = const_cast(context.GetExecutionContext()); + GE_CHECK_NOTNULL(execution_context); + auto &davinci_model = execution_context->davinci_model; + davinci_model.emplace_back(davinci_model_); + load_flag_ = true; + } + GELOGI("[%s] KnownNodeExecutor::Init success.", context.GetNodeName()); return SUCCESS; } diff --git a/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.h b/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.h index 26141b5a..629cb543 100644 --- a/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.h +++ b/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.h @@ -42,6 +42,7 @@ class KnownNodeTask : public NodeTask { virtual Status DoInitDavinciModel(void *weight, size_t weight_size); private: std::shared_ptr davinci_model_ = nullptr; + bool load_flag_ = false; }; class KnownNodeExecutor : public NodeExecutor { diff --git a/parser b/parser index df9abef6..424ac060 160000 --- a/parser +++ b/parser @@ -1 +1 @@ -Subproject commit df9abef65f902f37ca664f6dda4c60727dac2aca +Subproject commit 424ac0609fe17f455865436462a2c62f85aea2b1 diff --git a/tests/ut/ge/CMakeLists.txt b/tests/ut/ge/CMakeLists.txt index 07b10dac..dabc1485 100755 --- a/tests/ut/ge/CMakeLists.txt +++ b/tests/ut/ge/CMakeLists.txt @@ -166,6 +166,7 @@ set(COMMON_SRC_FILES "${GE_CODE_DIR}/ge/common/dump/dump_properties.cc" "${GE_CODE_DIR}/ge/common/helper/model_helper.cc" "${GE_CODE_DIR}/ge/common/dump/dump_manager.cc" + "${GE_CODE_DIR}/ge/common/dump/exception_dumper.cc" "${GE_CODE_DIR}/ge/common/dump/opdebug_register.cc" "${GE_CODE_DIR}/ge/common/dump/dump_op.cc" "${GE_CODE_DIR}/ge/common/helper/om_file_helper.cc" @@ -756,6 +757,7 @@ set(MULTI_PARTS_TEST_FILES "common/datatype_transfer_unittest.cc" "common/dump_manager_unittest.cc" "common/dump_op_unittest.cc" + "common/dump_exception_unittest.cc" "common/opdebug_register_unittest.cc" "common/format_transfer_unittest.cc" "common/format_transfer_transpose_unittest.cc" diff --git a/tests/ut/ge/common/dump_exception_unittest.cc b/tests/ut/ge/common/dump_exception_unittest.cc new file mode 100644 index 00000000..339d532e --- /dev/null +++ b/tests/ut/ge/common/dump_exception_unittest.cc @@ -0,0 +1,54 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#define protected public +#define private public +#include "common/dump/exception_dumper.h" +#include "common/debug/log.h" +#include "common/ge_inner_error_codes.h" +#undef private +#undef protected + +namespace ge { +class UTEST_dump_exception : public testing::Test { + protected: + void SetUp() {} + void TearDown() {} +}; + +TEST_F(UTEST_dump_exception, save_dump_op_info_success) { + OpDescPtr op_desc = std::make_shared("GatherV2", "GatherV2"); + uint32_t task_id = 1; + uint32_t stream_id = 233; + vector input_addr; + vector output_addr; + ExceptionDumper exception_dumper; + exception_dumper.SaveDumpOpInfo(op_desc, task_id, stream_id, input_addr, output_addr); +} + +TEST_F(UTEST_dump_exception, dump_exception_info) { + rtExceptionInfo exception_info = {1, 2, 3, 4, 5}; + std::vector exception_infos = { exception_info }; + OpDescInfo op_desc_info = {"Save", "Save", 1, 2, {FORMAT_NCHW}, {{1}}, {DT_FLOAT}, {}, {2}, + {FORMAT_NCHW}, {{1}}, {DT_FLOAT}, {}, {2}}; + + ExceptionDumper exception_dumper; + exception_dumper.op_desc_info_ = { op_desc_info }; + exception_dumper.DumpExceptionInfo(exception_infos); +} +} // namespace ge \ No newline at end of file diff --git a/tests/ut/ge/graph/load/davinci_model_unittest.cc b/tests/ut/ge/graph/load/davinci_model_unittest.cc index 0cf0f5cb..56a91ef8 100644 --- a/tests/ut/ge/graph/load/davinci_model_unittest.cc +++ b/tests/ut/ge/graph/load/davinci_model_unittest.cc @@ -1034,4 +1034,16 @@ TEST_F(UtestDavinciModel, NnExecute) { model.task_list_.resize(1); EXPECT_EQ(model.NnExecute(stream, false, input_data, output_data), SUCCESS); } +TEST_F(UtestDavinciModel, update_io_addr_success) { + DavinciModel model(0, nullptr); + uint32_t task_id = 1; + uint32_t stream_id = 2; + model.fixed_mem_base_ = 0x22; + model.mem_base_ = reinterpret_cast(&task_id); + OpDescInfo op_desc_info = {"Save", "Save", 1, 2, {FORMAT_NCHW}, {{1}}, {DT_FLOAT}, {nullptr}, {2}, + {FORMAT_NCHW}, {{1}}, {DT_FLOAT}, {nullptr}, {2}}; + model.exception_dumper_.op_desc_info_ = { op_desc_info }; + vector io_addr = {nullptr, nullptr}; + model.UpdateOpIOAddrs(task_id, stream_id, io_addr); +} } // namespace ge