
!813 Eliminate output_op_list_

From: @zhangxiaokun9
Reviewed-by: @xchu42,@wqtshg,@ji_chen
Signed-off-by: @ji_chen
tags/v1.2.0
mindspore-ci-bot · 3 years ago
commit 7627fd2365
4 changed files with 583 additions and 262 deletions
  1. +184 -171  ge/graph/load/new_model_manager/davinci_model.cc
  2. +113 -91   ge/graph/load/new_model_manager/davinci_model.h
  3. +1   -0    tests/ut/ge/CMakeLists.txt
  4. +285 -0    tests/ut/ge/graph/load/davinci_model_unittest.cc
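
The refactor replaces the DavinciModel members output_op_list_ and (for runtime queries) data_op_list_ with data cached once during init: input_addrs_list_, output_addrs_list_, output_buffer_size_, output_shape_info_, output_descs_ and output_formats_, built by the new OptInputOutputInfo(). The standalone C++ sketch below only illustrates that caching pattern; its Model/OpDesc types and method names are simplified stand-ins, not the real GE classes.

// Minimal standalone sketch of the pattern in this commit (illustrative names,
// not the real GE classes): resolve what the runtime needs from the Data and
// NetOutput OpDescs once during init, cache it as plain address lists, and
// drop the op lists afterwards.
#include <cstdio>
#include <map>
#include <string>
#include <vector>

struct OpDesc {                       // stand-in for ge::OpDesc
  std::string name;
  std::vector<void *> input_addrs;    // already-resolved runtime addresses
  std::vector<void *> output_addrs;
};

class Model {                         // stand-in for DavinciModel
 public:
  // Called per Data node during init (cf. InitDataOp filling data_by_index).
  void AddDataOp(const OpDesc &data, uint32_t index) { data_by_index_[index] = data; }

  // Called per NetOutput node during init (cf. InitNetOutput filling output_op_list).
  void AddNetOutput(const OpDesc &net_output) { output_op_list_.push_back(net_output); }

  // Same role as OptInputOutputInfo(): flatten op info into address lists,
  // after which no member op list is needed any more.
  void CacheIoInfo() {
    for (const auto &item : data_by_index_) {
      input_addrs_list_.push_back(item.second.output_addrs);   // Data outputs feed the model
    }
    for (const auto &op : output_op_list_) {
      output_addrs_list_.push_back(op.input_addrs);            // NetOutput inputs are model outputs
    }
    output_op_list_.clear();                                   // nothing like output_op_list_ survives init
  }

  size_t InputCount() const { return input_addrs_list_.size(); }
  size_t OutputCount() const { return output_addrs_list_.size(); }

 private:
  std::map<uint32_t, OpDesc> data_by_index_;   // Data ops ordered by index
  std::vector<OpDesc> output_op_list_;         // used during init only
  std::vector<std::vector<void *>> input_addrs_list_;
  std::vector<std::vector<void *>> output_addrs_list_;
};

int main() {
  Model model;
  int buf = 0;
  model.AddDataOp({"data", {}, {&buf}}, 0);
  model.AddNetOutput({"netoutput", {&buf}, {}});
  model.CacheIoInfo();
  std::printf("inputs=%zu outputs=%zu\n", model.InputCount(), model.OutputCount());
  return 0;
}

Caching at init also lets query paths such as GetOutputDescInfo() and CreateKnownZeroCopyMap() read plain vectors instead of re-walking OpDescs, as the diff below shows.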

+184 -171  ge/graph/load/new_model_manager/davinci_model.cc

@@ -163,7 +163,6 @@ DavinciModel::~DavinciModel() {

op_list_.clear();
data_op_list_.clear();
output_op_list_.clear();
tensor_name_to_fixed_addr_size_.clear();
tensor_name_to_peer_output_index_.clear();
GE_DELETE_NEW_SINGLE(data_inputer_);
@@ -830,12 +829,11 @@ Status DavinciModel::InitNodes(const ComputeGraphPtr &compute_graph) {
{CASE, &DavinciModel::InitCase},
};

GE_CHK_STATUS_RET(InitInputOutputForDynamic(compute_graph), "InitInputOutputForDynamic failed.");

vector<OpDescPtr> output_op_list;
map<uint32_t, OpDescPtr> data_by_index;
auto nodes = compute_graph->GetAllNodes();
const CustAICPUKernelStore &aicpu_kernel_store = ge_model_->GetCustAICPUKernelStore();
for (size_t i = 0; i < nodes.size(); i++) {
for (size_t i = 0; i < nodes.size(); ++i) {
auto node = nodes.at(i);
auto op_desc = node->GetOpDesc();
if (op_desc == nullptr) {
@@ -850,7 +848,7 @@ Status DavinciModel::InitNodes(const ComputeGraphPtr &compute_graph) {
GE_TIMESTAMP_ADD(LoadTBEKernelBinToOpDesc);

if (IsDataOp(op_desc->GetType())) {
if (InitDataOp(node, data_op_index, data_by_index) != SUCCESS) {
if (InitDataOp(compute_graph, node, data_op_index, data_by_index) != SUCCESS) {
GELOGE(PARAM_INVALID, "Data init failed, Name: %s", op_desc->GetName().c_str());
return PARAM_INVALID;
}
@@ -859,7 +857,7 @@ Status DavinciModel::InitNodes(const ComputeGraphPtr &compute_graph) {
}

if (op_desc->GetType() == NETOUTPUT) {
if (InitNetOutput(node) != SUCCESS) {
if (InitNetOutput(compute_graph, node, output_op_list) != SUCCESS) {
GELOGE(PARAM_INVALID, "NetOutput init failed, Name: %s", op_desc->GetName().c_str());
return PARAM_INVALID;
}
@@ -919,33 +917,10 @@ Status DavinciModel::InitNodes(const ComputeGraphPtr &compute_graph) {
}
GE_TIMESTAMP_ADD(InitTbeHandle);
}
AdjustDataOpList(data_by_index);
GE_TIMESTAMP_CALLNUM_END(LoadTBEKernelBinToOpDesc, "GraphLoader::LoadTBEKernelBinToOpDesc.");
GE_TIMESTAMP_CALLNUM_END(InitTbeHandle, "GraphLoader::InitTbeHandle.");
return SUCCESS;
}

Status DavinciModel::InitInputOutputForDynamic(const ComputeGraphPtr &compute_graph) {
if (!known_node_) return SUCCESS;
// for dynamic shape
auto direct_nodes = compute_graph->GetDirectNode();
for (size_t i = 0; i < direct_nodes.size(); i++) {
auto node = direct_nodes.at(i);
auto op_desc = node->GetOpDesc();
if (op_desc == nullptr) {
GELOGE(PARAM_INVALID, "op_desc is null.");
return PARAM_INVALID;
}
if (IsDataOp(op_desc->GetType())) {
GELOGD("init data op %s", op_desc->GetName().c_str());
data_op_list_.push_back(op_desc);
}
if (op_desc->GetType() == NETOUTPUT) {
GELOGD("init netouput op %s", op_desc->GetName().c_str());
output_op_list_.push_back(op_desc);
}
}
return SUCCESS;
return OptInputOutputInfo(data_by_index, output_op_list);
}

void DavinciModel::SetLabelForDynamic(const NodePtr &node) {
@@ -963,24 +938,35 @@ void DavinciModel::SetLabelForDynamic(const NodePtr &node) {
}
}

///
/// @ingroup ge
/// @brief Data Op Initialize.
/// @param [in] ComputeGraphPtr: root graph of the model.
/// @param [in] NodePtr: Data Op.
/// @param [in/out] data_op_index: NetOutput addr size info.
/// @param [in/out] data_op_index: index of current count.
/// @param [in/out] data_by_index: Data ordered by index.
/// @return Status
Status DavinciModel::InitDataOp(const NodePtr &node, uint32_t &data_op_index, map<uint32_t, OpDescPtr> &data_by_index) {
///
Status DavinciModel::InitDataOp(const ComputeGraphPtr &graph, const NodePtr &node, uint32_t &data_op_index,
map<uint32_t, OpDescPtr> &data_by_index) {
// op_desc Checked by Init: Data, valid.
auto op_desc = node->GetOpDesc();
if (known_node_) {
if (node->GetOwnerComputeGraph() != graph) {
GELOGI("Skip subgraph Data node: %s.", op_desc->GetName().c_str());
return SUCCESS;
}
uint32_t parent_index = 0; // Ignore subgraph Data Node.
if (AttrUtils::GetInt(op_desc, ATTR_NAME_PARENT_NODE_INDEX, parent_index)) {
GELOGI("Init zero copy by subgraph Data node: %s.", op_desc->GetName().c_str());
return SUCCESS;

GELOGI("Init Data node: %s.", op_desc->GetName().c_str());
auto data_index = data_op_index++;
if (AttrUtils::GetInt(op_desc, ATTR_NAME_INDEX, data_index)) {
GELOGD("Get new index %u, old %u", data_index, data_op_index - 1);
}

data_by_index[data_index] = op_desc;
data_op_list_.push_back(op_desc);
if (known_node_) {
return SUCCESS;
}

// Make information for copy input data.
const vector<int64_t> output_size_list = ModelUtils::GetOutputSize(op_desc);
@@ -992,10 +978,7 @@ Status DavinciModel::InitDataOp(const NodePtr &node, uint32_t &data_op_index, ma
op_desc->GetName().c_str(), output_size_list.size(), virtual_addr_list.size(), output_offset_list.size());
return PARAM_INVALID;
}
auto data_index = data_op_index;
if (AttrUtils::GetInt(op_desc, ATTR_NAME_INDEX, data_index)) {
GELOGD("ge_train: get new index %u, old %u", data_index, data_op_index);
}

bool fusion_flag = false;
ZeroCopyOffset zero_copy_offset;
int64_t data_size = output_size_list[kDataIndex];
@@ -1006,7 +989,6 @@ Status DavinciModel::InitDataOp(const NodePtr &node, uint32_t &data_op_index, ma
return PARAM_INVALID;
}
new_input_data_info_[data_index] = zero_copy_offset;
data_by_index[data_index] = op_desc;

for (size_t index = 0; index < virtual_addr_list.size(); ++index) {
void *addr = virtual_addr_list.at(index);
@@ -1017,7 +999,6 @@ Status DavinciModel::InitDataOp(const NodePtr &node, uint32_t &data_op_index, ma
new_input_outside_addrs_[addr] = zero_copy_offset;
}

data_op_index++;
return SUCCESS;
}

@@ -1025,18 +1006,52 @@ Status DavinciModel::InitDataOp(const NodePtr &node, uint32_t &data_op_index, ma
/// @ingroup ge
/// @brief Sort Data op list by index.
/// @param [in] data_by_index: map of Data Op.
/// @return
/// @param [in] output_op_list: list of NetOutput op.
/// @return Status
///
void DavinciModel::AdjustDataOpList(const map<uint32_t, OpDescPtr> &data_by_index) {
Status DavinciModel::OptInputOutputInfo(const map<uint32_t, OpDescPtr> &data_by_index,
const vector<OpDescPtr> &output_op_list) {
GELOGD("Data node size: %zu, NetOutput node size: %zu", data_op_list_.size(), output_op_list.size());
if (data_by_index.size() != data_op_list_.size()) {
GELOGW("Data map size: %zu, Data list size: %zu.", data_by_index.size(), data_op_list_.size());
return;
GELOGE(INTERNAL_ERROR, "Data map size: %zu, Data list size: %zu.", data_by_index.size(), data_op_list_.size());
return INTERNAL_ERROR;
}

data_op_list_.clear();
for (auto &item : data_by_index) {
data_op_list_.emplace_back(item.second);
auto output_addrs = ModelUtils::GetOutputDataAddrs(runtime_param_, item.second);
GELOGD("Data node: %s, output addr size: %zu", item.second->GetName().c_str(), output_addrs.size());
input_addrs_list_.emplace_back(output_addrs);

if (item.second->GetType() == AIPP_DATA_TYPE) {
GELOGI("This is dynamic aipp model, Node: %s", item.second->GetName().c_str());
is_dynamic_aipp_ = true;
}
}

for (const auto &op_desc : output_op_list) {
auto input_addrs = ModelUtils::GetInputDataAddrs(runtime_param_, op_desc);
GELOGD("NetOutput node: %s, input addr size: %zu", op_desc->GetName().c_str(), input_addrs.size());
output_addrs_list_.emplace_back(input_addrs);

bool getnext_sink_dynamic = false;
if (AttrUtils::GetBool(op_desc, ATTR_GETNEXT_SINK_DYNMAIC, getnext_sink_dynamic) && getnext_sink_dynamic) {
GELOGI("ATTR_GETNEXT_SINK_DYNMAIC has been set and is true, node: %s", op_desc->GetName().c_str());
is_getnext_sink_dynamic_ = true;
}

vector<string> shape_info;
if (AttrUtils::GetListStr(op_desc, ATTR_NAME_DYNAMIC_OUTPUT_DIMS, shape_info)) {
dynamic_output_shape_info_.insert(dynamic_output_shape_info_.end(), shape_info.begin(), shape_info.end());
}

if (InitOutputTensorInfo(op_desc) != SUCCESS) {
return INTERNAL_ERROR;
}
}

return InitOutputDescInfo(output_op_list, output_descs_, output_formats_);
}

bool DavinciModel::IsGetNextSinkDynamic(const OpDescPtr &op_desc) {
@@ -1050,24 +1065,27 @@ bool DavinciModel::IsGetNextSinkDynamic(const OpDescPtr &op_desc) {

/// @ingroup ge
/// @brief NetOutput Op Initialize.
/// @param [in] ComputeGraphPtr: root graph of the model.
/// @param [in] NodePtr: NetOutput Op.
/// @param [in/out] vector<OpDescPtr>: All NetOutput node in model.
/// @return Status
Status DavinciModel::InitNetOutput(const NodePtr &node) {
Status DavinciModel::InitNetOutput(const ComputeGraphPtr &graph, const NodePtr &node,
vector<OpDescPtr> &output_op_list) {
// node->GetOpDesc Checked by Init: NetOutput, valid.
auto op_desc = node->GetOpDesc();
// excludes the function op sub graph, e.g. case,if
if (known_node_) {
if (node->GetOwnerComputeGraph() != graph) {
GELOGI("Skip subgraph NetOutput node: %s.", op_desc->GetName().c_str());
op_list_.erase(op_desc->GetId());
return SUCCESS;
}
ComputeGraphPtr owner_graph = node->GetOwnerComputeGraph();
GE_CHECK_NOTNULL(owner_graph);
if (owner_graph->GetParentGraph() != nullptr) {
GELOGI("Init zero copy by subgraph NetOutput node: %s.", op_desc->GetName().c_str());
op_list_.erase(op_desc->GetId());

GELOGI("Init NetOutput node: %s.", op_desc->GetName().c_str());
output_op_list.push_back(op_desc);
if (known_node_) {
return SUCCESS;
}

output_op_list_.push_back(op_desc);
// Make information for copy output data.
const vector<int64_t> input_size_list = ModelUtils::GetInputSize(op_desc);
const vector<void *> virtual_addr_list = ModelUtils::GetInputDataAddrs(runtime_param_, op_desc);
@@ -1665,32 +1683,30 @@ Status DavinciModel::CpuModelRepeat() {

Status DavinciModel::GetInputOutputDescInfo(vector<InputOutputDescInfo> &input_desc,
vector<InputOutputDescInfo> &output_desc) {
if ((data_op_list_.empty()) || (data_op_list_[0]->GetInputsSize()) != 1) {
if (input_addrs_list_.empty() || input_addrs_list_[0].size() != 1) {
GELOGI("data_op_list_ is empty or input_desc size is not 1.");
} else {
std::vector<uint32_t> input_formats;
vector<uint32_t> input_formats;
GE_CHK_STATUS_RET(GetInputDescInfo(input_desc, input_formats), "get input desc info failed.");
}

std::vector<uint32_t> outputFormats;
GE_CHK_STATUS_RET(GetOutputDescInfo(output_desc, outputFormats), "get output desc info failed.");

vector<uint32_t> output_formats;
GE_CHK_STATUS_RET(GetOutputDescInfo(output_desc, output_formats), "get output desc info failed");
return SUCCESS;
}

Status DavinciModel::GetInputOutputDescInfo(vector<InputOutputDescInfo> &input_desc,
vector<InputOutputDescInfo> &output_desc,
std::vector<uint32_t> &input_formats,
std::vector<uint32_t> &outputFormats) {
if ((data_op_list_.empty()) || (data_op_list_[0]->GetInputsSize()) != 1) {
vector<uint32_t> &input_formats,
vector<uint32_t> &output_formats) {
if (input_addrs_list_.empty() || input_addrs_list_[0].size() != 1) {
GELOGE(FAILED, "OP List Pointer is null or input_desc size is not 1!");
return FAILED;
}

GE_CHK_STATUS_RET(GetInputDescInfo(input_desc, input_formats), "get input desc info failed");

GE_CHK_STATUS_RET(GetOutputDescInfo(output_desc, outputFormats), "get ouput desc info failed");

GE_CHK_STATUS_RET(GetOutputDescInfo(output_desc, output_formats), "get output desc info failed");
return SUCCESS;
}

@@ -1828,29 +1844,22 @@ void DavinciModel::GetCurShape(std::vector<int64_t> &batch_info, int32_t &dynami
dynamic_type = dynamic_type_;
}

void DavinciModel::GetModelAttr(std::vector<std::string> &dynamic_output_shape_info) {
for (auto &op : output_op_list_) {
if (op->GetType() != NETOUTPUT) {
continue;
}
if (!AttrUtils::GetListStr(op, ATTR_NAME_DYNAMIC_OUTPUT_DIMS, dynamic_output_shape_info)) {
GELOGD("Can not get dynamic output dims attr");
}
}
void DavinciModel::GetModelAttr(vector<string> &out_shape_info) {
out_shape_info.insert(out_shape_info.end(), dynamic_output_shape_info_.begin(), dynamic_output_shape_info_.end());
}

Status DavinciModel::GetInputOutputDescInfoForZeroCopy(vector<InputOutputDescInfo> &input_desc,
vector<InputOutputDescInfo> &output_desc,
std::vector<uint32_t> &input_formats,
std::vector<uint32_t> &outputFormats) {
if ((data_op_list_.empty()) || (1 != data_op_list_[0]->GetInputsSize())) {
std::vector<uint32_t> &output_formats) {
if (input_addrs_list_.empty() || input_addrs_list_[0].size() != kOutputNum) {
GELOGE(FAILED, "OP List Pointer is null or input_desc size is not 1!");
return FAILED;
}

GE_CHK_STATUS_RET(GetInputDescInfo(input_desc, input_formats), "get input desc info failed");

GE_CHK_STATUS_RET(GetOutputDescInfo(output_desc, outputFormats), "get ouput desc info failed");
GE_CHK_STATUS_RET(GetOutputDescInfo(output_desc, output_formats), "get ouput desc info failed");

GE_CHK_BOOL_RET_STATUS(output_desc.size() == output_memory_size_list_.size(), INTERNAL_ERROR,
"output_desc size[%zu] not equal output_size_list_[%zu] size!", output_desc.size(),
@@ -1939,7 +1948,7 @@ Status DavinciModel::GetInputDescInfo(vector<InputOutputDescInfo> &input_desc, s
return SUCCESS;
}

void DavinciModel::CreateOutput(uint32_t index, OpDescPtr &op_desc, InputOutputDescInfo &output,
void DavinciModel::CreateOutput(uint32_t index, const OpDescPtr &op_desc, InputOutputDescInfo &output,
uint32_t &format_result) {
/// netoutput input tensor desc
GE_IF_BOOL_EXEC(op_desc->GetInputDescPtr(index) == nullptr, GELOGE(FAILED, "OpDesc GetInputDescPtr is nullptr");
@@ -1992,10 +2001,10 @@ void DavinciModel::CreateOutput(uint32_t index, OpDescPtr &op_desc, InputOutputD
output.data_type = op_desc->GetInputDescPtr(index)->GetDataType();
}

Status DavinciModel::GetOutputDescInfo(vector<InputOutputDescInfo> &output_desc, std::vector<uint32_t> &formats) {
GELOGD("Output node size: %zu", output_op_list_.size());
for (size_t i = 0; i < output_op_list_.size(); i++) {
auto &op_desc = output_op_list_[i];
Status DavinciModel::InitOutputDescInfo(const vector<OpDescPtr> &output_op_list,
vector<InputOutputDescInfo> &output_descs, vector<uint32_t> &output_formats) {
GELOGD("Output node size: %zu", output_op_list.size());
for (const auto &op_desc : output_op_list) {
uint32_t out_size = static_cast<uint32_t>(op_desc->GetInputsSize());
for (uint32_t index = 0; index < out_size; index++) {
string output_name;
@@ -2018,13 +2027,19 @@ Status DavinciModel::GetOutputDescInfo(vector<InputOutputDescInfo> &output_desc,
std::to_string(src_index[index]);
}
output.name = output_name;
output_desc.push_back(output);
formats.push_back(format_result);
output_descs.push_back(output);
output_formats.push_back(format_result);
}
}
return SUCCESS;
}

Status DavinciModel::GetOutputDescInfo(vector<InputOutputDescInfo> &output_descs, vector<uint32_t> &output_formats) {
output_descs.insert(output_descs.end(), output_descs_.begin(), output_descs_.end());
output_formats.insert(output_formats.end(), output_formats_.begin(), output_formats_.end());
return SUCCESS;
}

ge::Format DavinciModel::GetFormat() {
if ((data_op_list_.empty()) || data_op_list_[0] == nullptr || data_op_list_[0]->GetInputDescPtr(0) == nullptr) {
GELOGW("OP List Pointer is null or input_desc size is not 1!");
@@ -2362,7 +2377,7 @@ void DavinciModel::SetProfileTime(ModelProcStage stage, int64_t endTime) {
/// @author
///
Status DavinciModel::CopyOutputData(uint32_t data_id, OutputData &output_data, rtMemcpyKind_t kind) {
if (output_op_list_.empty()) {
if (output_addrs_list_.empty()) {
Status ret = SyncVarData();
return ret;
}
@@ -2421,20 +2436,12 @@ Status DavinciModel::CopyOutputData(uint32_t data_id, OutputData &output_data, r
return SUCCESS;
}

Status DavinciModel::GenOutputTensorInfo(const OpDescPtr &op_desc, uint32_t data_index, OutputData *output_data,
std::vector<ge::OutputTensorInfo> &outputs) {
GE_CHECK_NOTNULL(op_desc);
GE_CHECK_NOTNULL(output_data);
if (output_data->blobs.size() > data_index) {
GELOGI("No need to generate output tensor info, model id:%u", model_id_);
return SUCCESS;
}
std::vector<int64_t> out_buffer_size_vec;
std::vector<std::vector<int64_t>> shape_info_vec;
Status DavinciModel::InitOutputTensorInfo(const OpDescPtr &op_desc) {
size_t input_num = op_desc->GetInputsSize();
if (is_getnext_sink_dynamic_) {
input_num = input_num - kGetDynamicDimsCount;
}

for (size_t i = 0; i < input_num; ++i) {
int64_t size = 0;
auto input_desc = op_desc->GetInputDescPtr(i);
@@ -2454,25 +2461,37 @@ Status DavinciModel::GenOutputTensorInfo(const OpDescPtr &op_desc, uint32_t data
}
}
GELOGI("Output size is %ld, output shape is %s.", size, formats::JoinToString(output_shape).c_str());
out_buffer_size_vec.push_back(size);
shape_info_vec.push_back(output_shape);
output_buffer_size_.push_back(size);
output_shape_info_.push_back(output_shape);
}

return SUCCESS;
}

Status DavinciModel::GenOutputTensorInfo(OutputData *output_data, vector<OutputTensorInfo> &outputs) {
GE_CHECK_NOTNULL(output_data);
if (!output_data->blobs.empty()) {
GELOGI("No need to generate output tensor info, model id:%u", model_id_);
return SUCCESS;
}
GELOGI("Output blobs size:%zu, data index:%u, model id:%u", out_buffer_size_vec.size(), data_index, model_id_);
for (size_t i = 0; i < out_buffer_size_vec.size(); ++i) {
std::unique_ptr<uint8_t[]> data_buf(new (std::nothrow) uint8_t[out_buffer_size_vec[i]]);

GELOGI("Output blobs size:%zu, model id:%u", output_buffer_size_.size(), model_id_);
for (size_t i = 0; i < output_buffer_size_.size(); ++i) {
std::unique_ptr<uint8_t[]> data_buf(new (std::nothrow) uint8_t[output_buffer_size_[i]]);
if (data_buf == nullptr) {
GELOGE(GE_GRAPH_MALLOC_FAILED, "Malloc buffer failed.");
return GE_GRAPH_MALLOC_FAILED;
}
output_data->blobs.push_back({data_buf.get(), static_cast<uint64_t>(out_buffer_size_vec[i]), false});
output_data->blobs.push_back({data_buf.get(), static_cast<uint64_t>(output_buffer_size_[i]), false});
ge::OutputTensorInfo output;
output.dims = shape_info_vec[i];
output.dims = output_shape_info_[i];
output.data = std::move(data_buf);
output.length = out_buffer_size_vec[i];
output.length = output_buffer_size_[i];
outputs.emplace_back(std::move(output));
GELOGD("Output index:%zu, output dims is %s, data length:%lu.", i,
formats::JoinToString(output.dims).c_str(), output.length);
}

return SUCCESS;
}

@@ -2507,36 +2526,28 @@ Status DavinciModel::ReturnResult(uint32_t data_id, const bool rslt_flg, const b
return INTERNAL_ERROR;
}

if (output_op_list_.empty()) {
if (output_addrs_list_.empty()) {
GELOGW("Output tensor list is empty, model id: %u", model_id_);
GE_CHK_STATUS(listener_->OnComputeDone(model_id_, data_id, INTERNAL_ERROR, outputs), "OnComputeDone failed.");
return INTERNAL_ERROR;
}

GE_CHECK_NOTNULL(output_data);
// index of data in output_data
uint32_t data_index = 0;

output_data->index = data_id;
output_data->model_id = model_id_;

is_getnext_sink_dynamic_ = false;
// copy output data from op to designated position
for (auto &op_desc : output_op_list_) {
if (IsGetNextSinkDynamic(op_desc)) {
GELOGD("Reinit cur dynamic dims when getnext sink dynamic.");
is_getnext_sink_dynamic_ = true;
cur_dynamic_dims_.clear();
cur_dynamic_dims_.resize(shape_of_cur_dynamic_dims_);
auto ret = rtMemcpy(cur_dynamic_dims_.data(), shape_of_cur_dynamic_dims_ * sizeof(int64_t),
netoutput_last_input_addr_, netoutput_last_input_size_, RT_MEMCPY_DEVICE_TO_HOST);
GE_CHK_RT_RET(ret);
}
GELOGD("Cur dynamic dims is %s.", formats::JoinToString(cur_dynamic_dims_).c_str());
if (GenOutputTensorInfo(op_desc, data_index, output_data, outputs) != SUCCESS) {
return INTERNAL_ERROR;
}
data_index += op_desc->GetInputsSize();
if (is_getnext_sink_dynamic_) {
GELOGD("Reinit cur dynamic dims when getnext sink dynamic.");
cur_dynamic_dims_.clear();
cur_dynamic_dims_.resize(shape_of_cur_dynamic_dims_);
auto ret = rtMemcpy(cur_dynamic_dims_.data(), shape_of_cur_dynamic_dims_ * sizeof(int64_t),
netoutput_last_input_addr_, netoutput_last_input_size_, RT_MEMCPY_DEVICE_TO_HOST);
GE_CHK_RT_RET(ret);
}

GELOGD("Cur dynamic dims is %s.", formats::JoinToString(cur_dynamic_dims_).c_str());
if (GenOutputTensorInfo(output_data, outputs) != SUCCESS) {
return INTERNAL_ERROR;
}

if (CopyOutputData(data_id, *output_data, RT_MEMCPY_DEVICE_TO_HOST) != SUCCESS) {
@@ -2668,10 +2679,10 @@ void *DavinciModel::Run(DavinciModel *model) {
model->SetProfileTime(MODEL_AFTER_PROC_START));
GE_TIMESTAMP_START(ReturnResult3);
// copy output data from device to host
GE_IF_BOOL_EXEC(!model->output_op_list_.empty(),
GE_IF_BOOL_EXEC(!model->output_addrs_list_.empty(),
(void)model->ReturnResult(current_data.index, rslt_flg, false, data_wrapper->GetOutput()))
// copy output data from device to host for variable graph
GE_IF_BOOL_EXEC(model->output_op_list_.empty(), (void)model->ReturnNoOutput(current_data.index));
GE_IF_BOOL_EXEC(model->output_addrs_list_.empty(), (void)model->ReturnNoOutput(current_data.index));
GE_IF_BOOL_EXEC(model->is_first_execute_,
GE_TIMESTAMP_EVENT_END(ReturnResult3, "GraphExcute::CopyDataFromDeviceToHost"));
GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingModelExecuteOn(),
@@ -2791,30 +2802,49 @@ void DavinciModel::UnbindTaskSinkStream() {
}
}

void *DavinciModel::GetRunAddress(void *addr) const {
if (fixed_mem_base_ == reinterpret_cast<uintptr_t>(mem_base_)) {
return addr;
}

uintptr_t ptr = reinterpret_cast<uintptr_t>(addr);
if ((fixed_mem_base_ <= ptr) && (ptr < fixed_mem_base_ + runtime_param_.mem_size)) {
return mem_base_ + (ptr - fixed_mem_base_);
} else {
return addr;
}
}

Status DavinciModel::CreateKnownZeroCopyMap(const vector<void *> &inputs, const vector<void *> &outputs) {
GELOGI("DavinciModel::CreateKnownZeroCopyMap in.");
if (inputs.size() > data_op_list_.size()) {
GELOGE(FAILED, "input data addr %zu should less than input op number %zu.", inputs.size(), data_op_list_.size());
GELOGI("in, inputs size: %zu, input addr size: %zu, outputs size: %zu, output addr size: %zu",
inputs.size(), input_addrs_list_.size(), outputs.size(), output_addrs_list_.size());
if (inputs.size() > input_addrs_list_.size()) {
GELOGE(FAILED, "input data addr %zu should less than input op num %zu.", inputs.size(), input_addrs_list_.size());
return FAILED;
}
// remove zero copy addr in last iteration
knonw_input_data_info_.clear();
knonw_output_data_info_.clear();
known_input_data_info_.clear();
known_output_data_info_.clear();
for (size_t i = 0; i < inputs.size(); ++i) {
const vector<void *> addr_list = ModelUtils::GetOutputDataAddrs(runtime_param_, data_op_list_[i]);
knonw_input_data_info_[addr_list[kDataIndex]] = inputs[i];
GELOGI("DavinciModel::CreateKnownZeroCopyMap input %zu,v addr %p,p addr %p .", i, addr_list[kDataIndex], inputs[i]);
const vector<void *> &addr_list = input_addrs_list_[i];
void *addr = GetRunAddress(addr_list[kDataIndex]);
known_input_data_info_[addr] = inputs[i];
GELOGI("input %zu, v addr %p, r addr %p, p addr %p", i, addr_list[kDataIndex], addr, inputs[i]);
}
if (output_op_list_.size() < kOutputNum) {
GELOGW("output op num in graph is %zu.", output_op_list_.size());

if (output_addrs_list_.empty()) {
GELOGW("output op num in graph is %zu", output_addrs_list_.size());
return SUCCESS;
}
const vector<void *> addr_list = ModelUtils::GetInputDataAddrs(runtime_param_, output_op_list_[kDataIndex]);
const vector<void *> &addr_list = output_addrs_list_.front();
for (size_t i = 0; i < addr_list.size() && i < outputs.size(); ++i) {
knonw_output_data_info_[addr_list[i]] = outputs[i];
GELOGI("DavinciModel::CreateKnownZeroCopyMap output %zu,v addr %p,p addr %p .", i, addr_list[i], outputs[i]);
void *addr = GetRunAddress(addr_list[i]);
known_output_data_info_[addr] = outputs[i];
GELOGI("output %zu, v addr %p, r addr %p, p addr %p", i, addr_list[i], addr, outputs[i]);
}
GELOGI("DavinciModel::CreateKnownZeroCopyMap success.");

GELOGI("success, known input data info size: %zu, known output data info size: %zu",
known_input_data_info_.size(), known_output_data_info_.size());
return SUCCESS;
}

@@ -2825,40 +2855,30 @@ void DavinciModel::SetTotalIOAddrs(const vector<void *> &io_addrs) {
}

for (size_t i = 0; i < io_addrs.size(); ++i) {
uintptr_t addr = reinterpret_cast<uintptr_t>(io_addrs[i]);
if ((fixed_mem_base_ <= addr) && (addr < fixed_mem_base_ + runtime_param_.mem_size)) {
total_io_addrs_.emplace_back(mem_base_ + (addr - fixed_mem_base_));
} else {
total_io_addrs_.emplace_back(io_addrs[i]);
}
total_io_addrs_.emplace_back(GetRunAddress(io_addrs[i]));
}
}

Status DavinciModel::UpdateKnownZeroCopyAddr(vector<void *> &total_io_addrs) {
if (fixed_mem_base_ != reinterpret_cast<uintptr_t>(mem_base_)) {
for (size_t i = 0; i < total_io_addrs.size(); ++i) {
uintptr_t addr = reinterpret_cast<uintptr_t>(total_io_addrs[i]);
if ((fixed_mem_base_ <= addr) && (addr < fixed_mem_base_ + runtime_param_.mem_size)) {
total_io_addrs[i] = mem_base_ + (addr - fixed_mem_base_);
}
total_io_addrs[i] = GetRunAddress(total_io_addrs[i]);
}
}

for (size_t i = 0; i < total_io_addrs.size(); ++i) {
auto it_in = knonw_input_data_info_.find(total_io_addrs[i]);
if (it_in != knonw_input_data_info_.end()) {
GELOGI("DavinciModel::UpdateKnownZeroCopyAddr input %zu,v addr %p,p addr %p .", i, total_io_addrs[i],
knonw_input_data_info_.at(total_io_addrs[i]));
total_io_addrs[i] = knonw_input_data_info_.at(total_io_addrs[i]);
auto it_in = known_input_data_info_.find(total_io_addrs[i]);
if (it_in != known_input_data_info_.end()) {
GELOGI("input %zu, v addr %p, p addr %p", i, total_io_addrs[i], known_input_data_info_.at(total_io_addrs[i]));
total_io_addrs[i] = known_input_data_info_.at(total_io_addrs[i]);
}
auto it_out = knonw_output_data_info_.find(total_io_addrs[i]);
if (it_out != knonw_output_data_info_.end()) {
GELOGI("DavinciModel::UpdateKnownZeroCopyAddr output %zu,v addr %p,p addr %p .", i, total_io_addrs[i],
knonw_output_data_info_.at(total_io_addrs[i]));
total_io_addrs[i] = knonw_output_data_info_.at(total_io_addrs[i]);
auto it_out = known_output_data_info_.find(total_io_addrs[i]);
if (it_out != known_output_data_info_.end()) {
GELOGI("output %zu, v addr %p, p addr %p", i, total_io_addrs[i], known_output_data_info_.at(total_io_addrs[i]));
total_io_addrs[i] = known_output_data_info_.at(total_io_addrs[i]);
}
}
GELOGI("DavinciModel::UpdateKnownZeroCopyAddr success.");
GELOGI("success, total io addrs size: %zu", total_io_addrs.size());
return SUCCESS;
}

@@ -3159,15 +3179,8 @@ bool DavinciModel::CheckInputAndModelSize(const int64_t &input_size, const int64
"MAY cause inference result ERROR, please check model input",
input_size, op_size);
}
bool is_dynamic_aipp = false;
for (const auto &op_desc : data_op_list_) {
if (op_desc->GetType() == AIPP_DATA_TYPE) {
GELOGI("This is dynamic aipp model.");
is_dynamic_aipp = true;
break;
}
}
if (is_dynamic_aipp) {

if (is_dynamic_aipp_) {
GELOGI("This is dynamic aipp model, no need to judge smaller input size");
return true;
}


+113 -91  ge/graph/load/new_model_manager/davinci_model.h

@@ -49,6 +49,10 @@
#include "task_info/task_info.h"
#include "graph/common/local_context.h"

using std::mutex;
using std::thread;
using std::multimap;

namespace ge {
// op debug need 2048 bits buffer
const size_t kOpDebugMemorySize = 2048UL;
@@ -84,11 +88,11 @@ struct SuperKernelTaskInfo {
uint32_t last_stream_id;
void *last_stream;
void *last_sm_desc;
std::vector<void *> kernel_list;
std::vector<void *> arg_list;
std::vector<uint32_t> dump_flag_list;
std::vector<OpDescPtr> op_desc_list;
std::vector<uintptr_t> dump_args_list;
vector<void *> kernel_list;
vector<void *> arg_list;
vector<uint32_t> dump_flag_list;
vector<OpDescPtr> op_desc_list;
vector<uintptr_t> dump_args_list;
uint32_t last_dump_flag;
int64_t last_group_key;
uintptr_t last_dump_args;
@@ -123,7 +127,7 @@ class DavinciModel {
/// @brief DavinciModel constructor
/// @author
///
DavinciModel(int32_t priority, const std::shared_ptr<ModelListener> &listener);
DavinciModel(int32_t priority, const shared_ptr<ModelListener> &listener);

///
/// @ingroup ge
@@ -153,7 +157,7 @@ class DavinciModel {
/// @param [in] output_que_ids: input queue ids from user, nums equal NetOutput Op.
/// @return: 0 for success / others for fail
///
Status SetQueIds(const std::vector<uint32_t> &input_queue_ids, const std::vector<uint32_t> &output_queue_ids);
Status SetQueIds(const vector<uint32_t> &input_queue_ids, const vector<uint32_t> &output_queue_ids);

///
/// @ingroup ge
@@ -223,13 +227,14 @@ class DavinciModel {
// get total mem size
size_t TotalMemSize() const { return runtime_param_.mem_size; }

const std::map<uint32_t, MemInfo> &P2PMemInfos() const {return runtime_param_.memory_infos;}
const map<uint32_t, MemInfo> &P2PMemInfos() const { return runtime_param_.memory_infos; }

// model name
string Name() const { return name_; }

// om_name
string OmName() const { return om_name_; }

// version
uint32_t Version() const { return version_; }

@@ -255,9 +260,6 @@ class DavinciModel {

Status DestroyThread();

// Get Data Op.
const vector<OpDescPtr> &GetDataList() const { return data_op_list_; }

// get Op
OpDescPtr GetOpByIndex(uint32_t index) const {
if (op_list_.find(index) == op_list_.end()) {
@@ -274,11 +276,12 @@ class DavinciModel {
}
return nullptr;
}

// get task info for profiling
const std::vector<TaskDescInfo> &GetTaskDescInfo() const { return task_desc_info_; }
const vector<TaskDescInfo> &GetTaskDescInfo() const { return task_desc_info_; }

// get updated task info list
std::vector<TaskInfoPtr> GetTaskList() { return task_list_; }
vector<TaskInfoPtr> GetTaskList() { return task_list_; }

// Modified from KernelTaskInfo.
SuperKernelTaskInfo &GetSuperKernelTaskInfo() { return skt_info_; }
@@ -323,7 +326,7 @@ class DavinciModel {
Status GetInputOutputDescInfo(vector<InputOutputDescInfo> &input_desc, vector<InputOutputDescInfo> &output_desc);

Status GetInputOutputDescInfo(vector<InputOutputDescInfo> &input_desc, vector<InputOutputDescInfo> &output_desc,
std::vector<uint32_t> &inputFormats, std::vector<uint32_t> &output_formats);
vector<uint32_t> &inputFormats, vector<uint32_t> &output_formats);

///
/// @ingroup ge
@@ -332,7 +335,7 @@ class DavinciModel {
/// @param [out] dynamic_type
/// @return execute result
///
Status GetDynamicBatchInfo(std::vector<std::vector<int64_t>> &batch_info, int32_t &dynamic_type) const;
Status GetDynamicBatchInfo(vector<vector<int64_t>> &batch_info, int32_t &dynamic_type) const;

///
/// @ingroup ge
@@ -340,13 +343,13 @@ class DavinciModel {
/// @param [out] batch_info
/// @return None
///
void GetCombinedDynamicDims(std::vector<std::vector<int64_t>> &batch_info) const;
void GetCombinedDynamicDims(vector<vector<int64_t>> &batch_info) const;

void GetUserDesignateShapeOrder(std::vector<std::string> &user_input_shape_order) const;
void GetUserDesignateShapeOrder(vector<string> &user_input_shape_order) const;

void GetCurShape(std::vector<int64_t> &batch_info, int32_t &dynamic_type);
void GetCurShape(vector<int64_t> &batch_info, int32_t &dynamic_type);

void GetModelAttr(std::vector<std::string> &dynamic_output_shape_info);
void GetModelAttr(vector<string> &dynamic_output_shape_info);

///
/// @ingroup ge
@@ -373,7 +376,7 @@ class DavinciModel {
/// @param [in] string identification: unique identification for current op.
/// @return None
///
void GetUniqueId(const OpDescPtr &op_desc, std::string &unique_identification);
void GetUniqueId(const OpDescPtr &op_desc, string &unique_identification);

///
/// @ingroup ge
@@ -384,7 +387,7 @@ class DavinciModel {
///
Status GetInputOutputDescInfoForZeroCopy(vector<InputOutputDescInfo> &input_desc,
vector<InputOutputDescInfo> &output_desc,
std::vector<uint32_t> &inputFormats, std::vector<uint32_t> &output_formats);
vector<uint32_t> &inputFormats, vector<uint32_t> &output_formats);

Status ReturnResult(uint32_t data_id, const bool rslt_flg, const bool seq_end_flg, OutputData *output_data);

@@ -406,8 +409,6 @@ class DavinciModel {
///
bool RunFlag() const { return run_flg_; }

Status GetOutputDescInfo(vector<InputOutputDescInfo> &output_desc, std::vector<uint32_t> &formats);

///
/// @ingroup ge
/// @brief Set Session Id
@@ -453,14 +454,14 @@ class DavinciModel {
/// @ingroup ge
/// @brief Save outside address of Data or NetOutput used info for ZeroCopy.
/// @param [in] const OpDescPtr &op_desc: current op desc
/// @param [in] const std::vector<void *> &outside_addrs: address of task
/// @param [in] const vector<void *> &outside_addrs: address of task
/// @param [in] const void *args_offset: arguments address save the address.
/// @return None.
///
void SetZeroCopyAddr(const OpDescPtr &op_desc, const std::vector<void *> &outside_addrs, const void *info, void *args,
void SetZeroCopyAddr(const OpDescPtr &op_desc, const vector<void *> &outside_addrs, const void *info, void *args,
size_t size, size_t offset);

void SetDynamicSize(const std::vector<uint64_t> &batch_num, int32_t dynamic_type);
void SetDynamicSize(const vector<uint64_t> &batch_num, int32_t dynamic_type);

bool GetL1FusionEnableOption() { return is_l1_fusion_enable_; }

@@ -476,7 +477,7 @@ class DavinciModel {
data_dumper_.SaveDumpOpInfo(model_param, op, task_id, stream_id);
}

void SaveDumpTask(uint32_t task_id, uint32_t stream_id, const std::shared_ptr<OpDesc> &op_desc, uintptr_t args) {
void SaveDumpTask(uint32_t task_id, uint32_t stream_id, const shared_ptr<OpDesc> &op_desc, uintptr_t args) {
data_dumper_.SaveDumpTask(task_id, stream_id, op_desc, args);
}

@@ -485,7 +486,7 @@ class DavinciModel {

DavinciModel(const DavinciModel &model) = delete;

const map<int64_t, std::vector<rtStream_t>> &GetHcclFolowStream() {
const map<int64_t, vector<rtStream_t>> &GetHcclFolowStream() {
return main_follow_stream_mapping_;
}
void SaveHcclFollowStream(int64_t main_stream_id, rtStream_t stream);
@@ -534,8 +535,8 @@ class DavinciModel {
void SetKnownNodeAddrNotChanged(bool base_addr_not_changed) { base_addr_not_changed_ = base_addr_not_changed; }

Status GetOrigInputInfo(uint32_t index, OriginInputInfo &orig_input_info);
Status GetAllAippInputOutputDims(uint32_t index, std::vector<InputOutputDims> &input_dims,
std::vector<InputOutputDims> &output_dims);
Status GetAllAippInputOutputDims(uint32_t index, vector<InputOutputDims> &input_dims,
vector<InputOutputDims> &output_dims);
void SetModelDescVersion(bool is_new_model_desc) { is_new_model_desc_ = is_new_model_desc; }
// om file name
void SetOmName(string om_name) { om_name_ = om_name; }
@@ -546,7 +547,6 @@ class DavinciModel {
bool GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const {
return data_dumper_.GetOpDescInfo(stream_id, task_id, op_desc_info);
}
Status InitInputOutputForDynamic(const ComputeGraphPtr &compute_graph);

private:
// memory address of weights
@@ -566,6 +566,8 @@ class DavinciModel {
struct timeInfo time_info_;
int32_t dataInputTid;

void *GetRunAddress(void *addr) const;

///
/// @ingroup ge
/// @brief Copy Check input size and model op size.
@@ -603,7 +605,7 @@ class DavinciModel {
/// @param [in] batch_label: batch label for multi-batch scenes
/// @return SUCCESS handle successfully / others handle failed
///
Status UpdateIoTaskArgs(const std::map<uint32_t, ZeroCopyOffset> &data_info, bool is_input,
Status UpdateIoTaskArgs(const map<uint32_t, ZeroCopyOffset> &data_info, bool is_input,
const vector<DataBuffer> &blobs, bool is_dynamic, const string &batch_label);

Status CopyInputData(const InputData &input_data, bool device_data = false);
@@ -619,7 +621,8 @@ class DavinciModel {

void SetInputDimsInfo(const vector<int64_t> &model_input_dims, Format &format, InputOutputDescInfo &input);

Status GetInputDescInfo(vector<InputOutputDescInfo> &input_desc, std::vector<uint32_t> &formats);
Status GetInputDescInfo(vector<InputOutputDescInfo> &input_desc, vector<uint32_t> &input_formats);
Status GetOutputDescInfo(vector<InputOutputDescInfo> &output_desc, vector<uint32_t> &output_formats);

Status InitTaskInfo(domi::ModelTaskDef &modelTaskInfo);

@@ -631,7 +634,7 @@ class DavinciModel {

uint8_t *MallocWeightsMem(size_t weights_size);

uint8_t* MallocP2PMem(size_t p2p_data_size);
uint8_t *MallocP2PMem(size_t p2p_data_size);

void FreeFeatureMapMem();

@@ -663,27 +666,33 @@ class DavinciModel {
///
/// @ingroup ge
/// @brief Data Op Initialize.
/// @param [in] ComputeGraphPtr: root graph of the model.
/// @param [in] NodePtr: Data Op.
/// @param [in/out] data_op_index: NetOutput addr size info.
/// @param [in/out] data_op_index: index of current count.
/// @param [in/out] data_by_index: Data ordered by index.
/// @return Status
///
Status InitDataOp(const NodePtr &node, uint32_t &data_op_index, map<uint32_t, OpDescPtr> &data_by_index);
Status InitDataOp(const ComputeGraphPtr &graph, const NodePtr &node, uint32_t &data_op_index,
map<uint32_t, OpDescPtr> &data_by_index);

///
/// @ingroup ge
/// @brief Sort Data op list by index.
/// @param [in] data_by_index: map of Data Op.
/// @return
/// @param [in] output_op_list: list of NetOutput op.
/// @return Status
///
void AdjustDataOpList(const map<uint32_t, OpDescPtr> &data_by_index);
Status OptInputOutputInfo(const map<uint32_t, OpDescPtr> &data_by_index, const vector<OpDescPtr> &output_op_list);

///
/// @ingroup ge
/// @brief NetOutput Op Initialize.
/// @param [in] ComputeGraphPtr: root graph of the model.
/// @param [in] NodePtr: NetOutput Op.
/// @param [in/out] vector<OpDescPtr>: All NetOutput node in model.
/// @return Status
///
Status InitNetOutput(const NodePtr &node);
Status InitNetOutput(const ComputeGraphPtr &graph, const NodePtr &node, vector<OpDescPtr> &output_op_list);

///
/// @ingroup ge
@@ -722,7 +731,7 @@ class DavinciModel {
///
Status InitTbeHandle(const OpDescPtr &op_desc);

void StoreTbeHandle(const std::string &handle_key);
void StoreTbeHandle(const string &handle_key);
void CleanTbeHandle();

///
@@ -753,7 +762,7 @@ class DavinciModel {
///
Status BindInputQueue();

Status CpuTaskModelZeroCopy(std::vector<uintptr_t> &mbuf_list, std::map<const void *, ZeroCopyOffset> &outside_addrs);
Status CpuTaskModelZeroCopy(vector<uintptr_t> &mbuf_list, map<const void *, ZeroCopyOffset> &outside_addrs);

///
/// @ingroup ge
@@ -824,7 +833,7 @@ class DavinciModel {

Status DoTaskSink();

void CreateOutput(uint32_t index, OpDescPtr &op_desc, InputOutputDescInfo &output, uint32_t &format_result);
void CreateOutput(uint32_t index, const OpDescPtr &op_desc, InputOutputDescInfo &output, uint32_t &format_result);

Status TransAllVarData(ComputeGraphPtr &graph, uint32_t graph_id);

@@ -838,13 +847,16 @@ class DavinciModel {

Status SinkTimeProfile(const InputData &current_data);

Status GenOutputTensorInfo(const OpDescPtr &op_desc, uint32_t data_index, OutputData *output_data,
std::vector<ge::OutputTensorInfo> &outputs);
Status InitOutputTensorInfo(const OpDescPtr &op_desc);
Status GenOutputTensorInfo(OutputData *output_data, vector<OutputTensorInfo> &outputs);

void ParseAIPPInfo(std::string in_out_info, InputOutputDims &dims_info);
Status InitOutputDescInfo(const vector<OpDescPtr> &output_op_list,
vector<InputOutputDescInfo> &output_desc, vector<uint32_t> &formats);

void ParseAIPPInfo(string in_out_info, InputOutputDims &dims_info);
void SetLabelForDynamic(const NodePtr &node);

void ParseDynamicOutShape(const std::vector<std::string> &str_info, std::vector<vector<int64_t>> &vec_info);
void ParseDynamicOutShape(const vector<string> &str_info, vector<vector<int64_t>> &vec_info);
bool IsGetNextSinkDynamic(const OpDescPtr &op_desc);
void GetAllGearsInfo(const NodePtr &node);
Status GetGetDynamicDimsNodeInfo(const NodePtr &node);
@@ -866,56 +878,54 @@ class DavinciModel {
GeModelPtr ge_model_;

bool need_destroy_aicpu_kernel_{false};
vector<std::string> out_node_name_;
vector<string> out_node_name_;

map<uint32_t, OpDescPtr> op_list_;

// data op_desc
vector<OpDescPtr> data_op_list_;

vector<OpDescPtr> output_op_list_;

vector<OpDescPtr> variable_op_list_;

std::map<uint32_t, ZeroCopyOffset> new_input_data_info_;
std::map<uint32_t, ZeroCopyOffset> new_output_data_info_;
std::map<const void *, ZeroCopyOffset> new_input_outside_addrs_;
std::map<const void *, ZeroCopyOffset> new_output_outside_addrs_;
map<uint32_t, ZeroCopyOffset> new_input_data_info_;
map<uint32_t, ZeroCopyOffset> new_output_data_info_;
map<const void *, ZeroCopyOffset> new_input_outside_addrs_;
map<const void *, ZeroCopyOffset> new_output_outside_addrs_;

std::set<const void *> real_virtual_addrs_;
set<const void *> real_virtual_addrs_;

// output op: save cce op actual needed memory size
vector<int64_t> output_memory_size_list_;

std::thread thread_id_;
thread thread_id_;

std::shared_ptr<ModelListener> listener_;
shared_ptr<ModelListener> listener_;

bool run_flg_;

std::mutex mux_run_flg_;
mutex mux_run_flg_;

int32_t priority_;

vector<rtStream_t> stream_list_;

std::mutex all_hccl_stream_list_mutex_;
mutex all_hccl_stream_list_mutex_;
vector<rtStream_t> all_hccl_stream_list_;

// for reuse hccl_follow_stream
std::mutex capacity_of_stream_mutex_;
std::map<int64_t, std::vector<rtStream_t>> main_follow_stream_mapping_;
mutex capacity_of_stream_mutex_;
map<int64_t, vector<rtStream_t>> main_follow_stream_mapping_;

vector<rtEvent_t> event_list_;

vector<rtLabel_t> label_list_;
set<uint32_t> label_id_indication_;

std::mutex outside_addrs_mutex_;
std::vector<ZeroCopyTask> zero_copy_tasks_; // Task used Data or NetOutput addr.
std::set<const void *> copy_only_addrs_; // Address need copy to original place.
mutex outside_addrs_mutex_;
vector<ZeroCopyTask> zero_copy_tasks_; // Task used Data or NetOutput addr.
set<const void *> copy_only_addrs_; // Address need copy to original place.

std::vector<TaskInfoPtr> task_list_;
vector<TaskInfoPtr> task_list_;
// rt_moodel_handle
rtModel_t rt_model_handle_;

@@ -933,39 +943,39 @@ class DavinciModel {
rtAicpuDeployType_t deploy_type_{AICPU_DEPLOY_RESERVED};

// ACL queue schedule, save queue ids for Init.
std::vector<TaskInfoPtr> cpu_task_list_;
std::vector<uint32_t> input_queue_ids_; // input queue ids created by caller.
std::vector<uint32_t> output_queue_ids_; // output queue ids created by caller.
std::vector<uintptr_t> input_mbuf_list_; // input mbuf created by dequeue task.
std::vector<uintptr_t> output_mbuf_list_; // output mbuf created by dequeue task.
vector<TaskInfoPtr> cpu_task_list_;
vector<uint32_t> input_queue_ids_; // input queue ids created by caller.
vector<uint32_t> output_queue_ids_; // output queue ids created by caller.
vector<uintptr_t> input_mbuf_list_; // input mbuf created by dequeue task.
vector<uintptr_t> output_mbuf_list_; // output mbuf created by dequeue task.

uint64_t session_id_;

uint32_t device_id_;

std::mutex flowctrl_op_index_internal_map_mutex_;
std::map<uint32_t, uint32_t> flowctrl_op_index_internal_map_;
mutex flowctrl_op_index_internal_map_mutex_;
map<uint32_t, uint32_t> flowctrl_op_index_internal_map_;

std::vector<rtStream_t> active_stream_list_;
std::set<uint32_t> active_stream_indication_;
vector<rtStream_t> active_stream_list_;
set<uint32_t> active_stream_indication_;

std::set<uint32_t> hcom_streams_;
set<uint32_t> hcom_streams_;
RuntimeParam runtime_param_;

static std::mutex tvm_bin_mutex_;
std::set<std::string> tvm_bin_kernel_;
static mutex tvm_bin_mutex_;
set<string> tvm_bin_kernel_;

std::map<std::string, uint32_t> used_tbe_handle_map_;
map<string, uint32_t> used_tbe_handle_map_;

// for profiling task and graph info
std::vector<TaskDescInfo> task_desc_info_;
vector<TaskDescInfo> task_desc_info_;

int64_t maxDumpOpNum_;
// for data dump
DataDumper data_dumper_;
uint64_t iterator_count_;
bool is_l1_fusion_enable_;
std::map<OpDescPtr, void *> saved_task_addrs_;
map<OpDescPtr, void *> saved_task_addrs_;
void *l1_fusion_addr_ = nullptr;

bool known_node_ = false;
@@ -976,14 +986,14 @@ class DavinciModel {
void *hybrid_addrs_ = nullptr;
uint32_t total_hybrid_args_size_ = 0;
int64_t total_fixed_addr_size_ = 0;
std::map<const void *, void *> knonw_input_data_info_;
std::map<const void *, void *> knonw_output_data_info_;
map<const void *, void *> known_input_data_info_;
map<const void *, void *> known_output_data_info_;
vector<void *> total_io_addrs_;
vector<void *> orig_total_io_addrs_;
bool base_addr_not_changed_ = false;

vector<vector<int64_t>> batch_info_;
std::vector<std::vector<int64_t>> combined_batch_info_;
vector<vector<int64_t>> combined_batch_info_;
vector<string> user_designate_shape_order_;
int32_t dynamic_type_ = 0;
bool is_dynamic_ = false;
@@ -991,35 +1001,47 @@ class DavinciModel {
vector<uint64_t> batch_size_;
// key: input tensor name, generally rts op;
// value: the fixed addr of input anchor, same as the peer output anchor addr of the peer op
std::map<string, int64_t> tensor_name_to_fixed_addr_size_;
map<string, int64_t> tensor_name_to_fixed_addr_size_;

// key: input tensor name, generally rts op; value: the peer output anchor of the peer op
std::map<string, int64_t> tensor_name_to_peer_output_index_;
map<string, int64_t> tensor_name_to_peer_output_index_;
// if model is first execute
bool is_first_execute_;
// for op debug
std::mutex debug_reg_mutex_;
mutex debug_reg_mutex_;
bool is_op_debug_reg_ = false;
void *op_debug_addr_ = nullptr;
void *p2p_debug_addr_ = nullptr;
bool is_new_model_desc_{false};
bool is_online_infer_dynamic_ = false;
bool is_getnext_sink_dynamic_ = false;
std::vector<int64_t> cur_dynamic_dims_;
vector<int64_t> cur_dynamic_dims_;
void *netoutput_last_input_addr_ = nullptr;
int64_t netoutput_last_input_size_ = 0;
size_t shape_of_cur_dynamic_dims_ = 0;
// key: input_index: input is merge node; value: each gear info and each output size
std::map<size_t, std::map<vector<int64_t>, int64_t>> merge_nodes_gear_and_real_out_size_info_;
map<size_t, map<vector<int64_t>, int64_t>> merge_nodes_gear_and_real_out_size_info_;
// key: input_index: input is merge node; value: each gear info and each output shape
std::map<size_t, std::map<vector<int64_t>, vector<int64_t>>> merge_nodes_gear_and_real_out_shape_info_;
std::vector<std::vector<int64_t>> all_gears_info_;
map<size_t, map<vector<int64_t>, vector<int64_t>>> merge_nodes_gear_and_real_out_shape_info_;
vector<vector<int64_t>> all_gears_info_;

std::multimap<uint32_t, uint32_t> op_id_map_;
std::vector<ProfileInfo> profile_list_;
multimap<uint32_t, uint32_t> op_id_map_;
vector<ProfileInfo> profile_list_;

// For super kernel.
SuperKernelTaskInfo skt_info_;

bool is_dynamic_aipp_ = false;
vector<string> dynamic_output_shape_info_;

vector<vector<void *>> input_addrs_list_;
vector<vector<void *>> output_addrs_list_;

vector<int64_t> output_buffer_size_;
vector<vector<int64_t>> output_shape_info_;

vector<InputOutputDescInfo> output_descs_;
vector<uint32_t> output_formats_;
};
} // namespace ge
#endif // GE_GRAPH_LOAD_NEW_MODEL_MANAGER_DAVINCI_MODEL_H_

+1 -0  tests/ut/ge/CMakeLists.txt

@@ -565,6 +565,7 @@ set(DISTINCT_GRAPH_LOAD_TEST_FILES
"graph/load/end_graph_task_unittest.cc"
"graph/load/new_model_manager_event_manager_unittest.cc"
#"graph/load/output_net_output_unittest.cc"
"graph/load/davinci_model_unittest.cc"
"graph/load/tbe_handle_store_unittest.cc"
"graph/load/hccl_task_info_unittest.cc"
"graph/load/kernel_ex_task_info_unittest.cc"


+285 -0  tests/ut/ge/graph/load/davinci_model_unittest.cc

@@ -0,0 +1,285 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <gtest/gtest.h>

#define private public
#define protected public
#include "graph/utils/graph_utils.h"
#include "common/profiling/profiling_manager.h"
#include "graph/load/new_model_manager/davinci_model.h"

using namespace std;

namespace ge {
extern OpDescPtr CreateOpDesc(string name, string type);

class UtestDavinciModel : public testing::Test {
protected:
void SetUp() {}

void TearDown() {}
};

TEST_F(UtestDavinciModel, init_success) {
DavinciModel model(0, nullptr);
ComputeGraphPtr graph = make_shared<ComputeGraph>("default");
ProfilingManager::Instance().is_load_profiling_ = true;

GeModelPtr ge_model = make_shared<GeModel>();
ge_model->SetGraph(GraphUtils::CreateGraphFromComputeGraph(graph));
AttrUtils::SetInt(ge_model, ATTR_MODEL_MEMORY_SIZE, 5120000);
AttrUtils::SetInt(ge_model, ATTR_MODEL_STREAM_NUM, 1);

shared_ptr<domi::ModelTaskDef> model_task_def = make_shared<domi::ModelTaskDef>();
ge_model->SetModelTaskDef(model_task_def);

GeTensorDesc tensor(GeShape(), FORMAT_NCHW, DT_FLOAT);
TensorUtils::SetSize(tensor, 512);

OpDescPtr op_input = CreateOpDesc("data", DATA);
op_input->AddInputDesc(tensor);
op_input->AddOutputDesc(tensor);
op_input->SetInputOffset({1024});
op_input->SetOutputOffset({1024});
NodePtr node_input = graph->AddNode(op_input); // op_index = 0

OpDescPtr op_kernel = CreateOpDesc("square", "Square");
op_kernel->AddInputDesc(tensor);
op_kernel->AddOutputDesc(tensor);
op_kernel->SetInputOffset({1024});
op_kernel->SetOutputOffset({1024});
NodePtr node_kernel = graph->AddNode(op_kernel); // op_index = 1

OpDescPtr op_memcpy = CreateOpDesc("memcpy", MEMCPYASYNC);
op_memcpy->AddInputDesc(tensor);
op_memcpy->AddOutputDesc(tensor);
op_memcpy->SetInputOffset({1024});
op_memcpy->SetOutputOffset({5120});
NodePtr node_memcpy = graph->AddNode(op_memcpy); // op_index = 2

OpDescPtr op_output = CreateOpDesc("output", NETOUTPUT);
op_output->AddInputDesc(tensor);
op_output->SetInputOffset({5120});
op_output->SetSrcName( { "memcpy" } );
op_output->SetSrcIndex( { 0 } );
NodePtr node_output = graph->AddNode(op_output); // op_index = 3


domi::TaskDef *task_def1 = model_task_def->add_task();
task_def1->set_stream_id(0);
task_def1->set_type(RT_MODEL_TASK_KERNEL);
domi::KernelDef *kernel_def = task_def1->mutable_kernel();
kernel_def->set_stub_func("stub_func");
kernel_def->set_args_size(64);
string args(64, '1');
kernel_def->set_args(args.data(), 64);
domi::KernelContext *context = kernel_def->mutable_context();
context->set_op_index(1);
context->set_kernel_type(2); // ccKernelType::TE
uint16_t args_offset[9] = {0};
context->set_args_offset(args_offset, 9 * sizeof(uint16_t));

domi::TaskDef *task_def2 = model_task_def->add_task();
task_def2->set_stream_id(0);
task_def2->set_type(RT_MODEL_TASK_MEMCPY_ASYNC);
domi::MemcpyAsyncDef *memcpy_async = task_def2->mutable_memcpy_async();
memcpy_async->set_src(1024);
memcpy_async->set_dst(5120);
memcpy_async->set_dst_max(512);
memcpy_async->set_count(1);
memcpy_async->set_kind(RT_MEMCPY_DEVICE_TO_DEVICE);
memcpy_async->set_op_index(2);

EXPECT_EQ(model.Assign(ge_model), SUCCESS);
EXPECT_EQ(model.Init(), SUCCESS);

EXPECT_EQ(model.input_addrs_list_.size(), 1);
EXPECT_EQ(model.output_addrs_list_.size(), 1);
EXPECT_EQ(model.task_list_.size(), 2);

ProfilingManager::Instance().is_load_profiling_ = false;
}

TEST_F(UtestDavinciModel, init_data_op) {
DavinciModel model(0, nullptr);
model.ge_model_ = make_shared<GeModel>();
model.runtime_param_.mem_base = (uint8_t *)0x08000000;
model.runtime_param_.mem_size = 5120000;
ComputeGraphPtr graph = make_shared<ComputeGraph>("default");

OpDescPtr op_input = CreateOpDesc("data", DATA);
GeTensorDesc tensor(GeShape(), FORMAT_NCHW, DT_FLOAT);
TensorUtils::SetSize(tensor, 512);
op_input->AddInputDesc(tensor);
op_input->AddOutputDesc(tensor);
op_input->SetInputOffset({1024});
op_input->SetOutputOffset({5120});
NodePtr node_input = graph->AddNode(op_input);

OpDescPtr op_output = CreateOpDesc("output", NETOUTPUT);
op_output->AddInputDesc(tensor);
op_output->SetInputOffset({1024});
op_output->SetSrcName( { "data" } );
op_output->SetSrcIndex( { 0 } );
NodePtr node_output = graph->AddNode(op_output);

EXPECT_EQ(model.InitNodes(graph), SUCCESS);

EXPECT_EQ(model.input_addrs_list_.size(), 1);
EXPECT_EQ(model.output_addrs_list_.size(), 1);
EXPECT_EQ(model.op_list_.size(), 2);
}

TEST_F(UtestDavinciModel, init_data_op_subgraph) {
DavinciModel model(0, nullptr);
model.runtime_param_.mem_base = (uint8_t *)0x08000000;
model.runtime_param_.mem_size = 5120000;
ComputeGraphPtr graph = make_shared<ComputeGraph>("default");

OpDescPtr op_input = CreateOpDesc("data", DATA);
GeTensorDesc tensor(GeShape(), FORMAT_NCHW, DT_FLOAT);
op_input->AddInputDesc(tensor);
op_input->AddOutputDesc(tensor);
op_input->SetInputOffset({1024});
op_input->SetOutputOffset({5120});
NodePtr node = graph->AddNode(op_input);

uint32_t data_op_index = 0;
map<uint32_t, OpDescPtr> data_by_index;
EXPECT_EQ(model.InitDataOp(nullptr, node, data_op_index, data_by_index), SUCCESS);

EXPECT_EQ(model.input_addrs_list_.size(), 0);
EXPECT_EQ(model.output_addrs_list_.size(), 0);
EXPECT_EQ(data_op_index, 0);
EXPECT_TRUE(data_by_index.empty());
}

TEST_F(UtestDavinciModel, init_netoutput_op_subgraph) {
DavinciModel model(0, nullptr);
model.runtime_param_.mem_base = (uint8_t *)0x08000000;
model.runtime_param_.mem_size = 5120000;
ComputeGraphPtr graph = make_shared<ComputeGraph>("default");

OpDescPtr op_output = CreateOpDesc("output", NETOUTPUT);
GeTensorDesc tensor(GeShape(), FORMAT_NCHW, DT_FLOAT);
op_output->AddInputDesc(tensor);
op_output->SetInputOffset({1024});
op_output->SetSrcName( { "data" } );
op_output->SetSrcIndex( { 0 } );
NodePtr node = graph->AddNode(op_output);

std::vector<OpDescPtr> output_op_list;
EXPECT_EQ(model.InitNetOutput(nullptr, node, output_op_list), SUCCESS);

EXPECT_EQ(model.input_addrs_list_.size(), 0);
EXPECT_EQ(model.output_addrs_list_.size(), 0);
EXPECT_TRUE(output_op_list.empty());
}

TEST_F(UtestDavinciModel, init_unknown) {
DavinciModel model(0, nullptr);
model.SetKnownNode(true);
ComputeGraphPtr graph = make_shared<ComputeGraph>("default");

GeModelPtr ge_model = make_shared<GeModel>();
ge_model->SetGraph(GraphUtils::CreateGraphFromComputeGraph(graph));
AttrUtils::SetInt(ge_model, ATTR_MODEL_MEMORY_SIZE, 5120000);
AttrUtils::SetInt(ge_model, ATTR_MODEL_STREAM_NUM, 1);

shared_ptr<domi::ModelTaskDef> model_task_def = make_shared<domi::ModelTaskDef>();
ge_model->SetModelTaskDef(model_task_def);

GeTensorDesc tensor(GeShape(), FORMAT_NCHW, DT_FLOAT);
TensorUtils::SetSize(tensor, 512);

OpDescPtr op_input = CreateOpDesc("data", DATA);
op_input->AddInputDesc(tensor);
op_input->AddOutputDesc(tensor);
op_input->SetInputOffset({1024});
op_input->SetOutputOffset({1024});
NodePtr node_input = graph->AddNode(op_input); // op_index = 0

OpDescPtr op_kernel = CreateOpDesc("square", "Square");
op_kernel->AddInputDesc(tensor);
op_kernel->AddOutputDesc(tensor);
op_kernel->SetInputOffset({1024});
op_kernel->SetOutputOffset({1024});
NodePtr node_kernel = graph->AddNode(op_kernel); // op_index = 1

OpDescPtr op_memcpy = CreateOpDesc("memcpy", MEMCPYASYNC);
op_memcpy->AddInputDesc(tensor);
op_memcpy->AddOutputDesc(tensor);
op_memcpy->SetInputOffset({1024});
op_memcpy->SetOutputOffset({5120});
NodePtr node_memcpy = graph->AddNode(op_memcpy); // op_index = 2

OpDescPtr op_output = CreateOpDesc("output", NETOUTPUT);
op_output->AddInputDesc(tensor);
op_output->SetInputOffset({5120});
op_output->SetSrcName( { "memcpy" } );
op_output->SetSrcIndex( { 0 } );
NodePtr node_output = graph->AddNode(op_output); // op_index = 3


domi::TaskDef *task_def1 = model_task_def->add_task();
task_def1->set_stream_id(0);
task_def1->set_type(RT_MODEL_TASK_KERNEL);
domi::KernelDef *kernel_def = task_def1->mutable_kernel();
kernel_def->set_stub_func("stub_func");
kernel_def->set_args_size(64);
string args(64, '1');
kernel_def->set_args(args.data(), 64);
domi::KernelContext *context = kernel_def->mutable_context();
context->set_op_index(1);
context->set_kernel_type(2); // ccKernelType::TE
uint16_t args_offset[9] = {0};
context->set_args_offset(args_offset, 9 * sizeof(uint16_t));

domi::TaskDef *task_def2 = model_task_def->add_task();
task_def2->set_stream_id(0);
task_def2->set_type(RT_MODEL_TASK_MEMCPY_ASYNC);
domi::MemcpyAsyncDef *memcpy_async = task_def2->mutable_memcpy_async();
memcpy_async->set_src(1024);
memcpy_async->set_dst(5120);
memcpy_async->set_dst_max(512);
memcpy_async->set_count(1);
memcpy_async->set_kind(RT_MEMCPY_DEVICE_TO_DEVICE);
memcpy_async->set_op_index(2);

EXPECT_EQ(model.Assign(ge_model), SUCCESS);
EXPECT_EQ(model.Init(), SUCCESS);

EXPECT_EQ(model.input_addrs_list_.size(), 1);
EXPECT_EQ(model.output_addrs_list_.size(), 1);
EXPECT_EQ(model.task_list_.size(), 2);

EXPECT_EQ(model.task_list_[0]->UpdateArgs(), SUCCESS);
EXPECT_EQ(model.task_list_[1]->UpdateArgs(), SUCCESS);

vector<string> out_shape_info;
model.GetModelAttr(out_shape_info);

vector<InputOutputDescInfo> input_descs;
vector<InputOutputDescInfo> output_descs;
EXPECT_EQ(model.GetInputOutputDescInfo(input_descs, output_descs), SUCCESS);

int32_t virtual_addr = 0;
const vector<void *> inputs = { &virtual_addr };
const vector<void *> outputs = { &virtual_addr };
EXPECT_EQ(model.UpdateKnownNodeArgs(inputs, outputs), SUCCESS);
}
} // namespace ge
