Browse Source

!1864 Fix memory leak and add recursive depth protection.

From: @zhao_zhixuan
Reviewed-by: @xchu42,@ji_chen
Signed-off-by: @ji_chen
tags/v1.3.0
mindspore-ci-bot Gitee 4 years ago
parent
commit
5c51e07d61
9 changed files with 24 additions and 31 deletions
  1. +10
    -2
      ge/common/ge/tbe_plugin_manager.cc
  2. +2
    -1
      ge/common/ge/tbe_plugin_manager.h
  3. +0
    -4
      ge/hybrid/node_executor/aicpu/aicpu_node_executor.cc
  4. +5
    -10
      ge/single_op/single_op_model.cc
  5. +1
    -2
      ge/single_op/single_op_model.h
  6. +3
    -5
      ge/single_op/task/aicpu_task_builder.cc
  7. +2
    -2
      ge/single_op/task/aicpu_task_builder.h
  8. +1
    -4
      ge/single_op/task/op_task.cc
  9. +0
    -1
      ge/single_op/task/op_task.h

+ 10
- 2
ge/common/ge/tbe_plugin_manager.cc View File

@@ -104,7 +104,15 @@ void TBEPluginManager::ProcessSoFullName(vector<string> &file_list, string &caff
}
}

void TBEPluginManager::FindParserSo(const string &path, vector<string> &file_list, string &caffe_parser_path) {
void TBEPluginManager::FindParserSo(const string &path, vector<string> &file_list,
string &caffe_parser_path, int recursive_depth) {
static const int kMaxRecursiveDepth = 20; // For recursive depth protection

if (recursive_depth >= kMaxRecursiveDepth) {
GELOGW("Recursive depth is become %d, Please check input!", recursive_depth);
return;
}

// Path, change to absolute path
string real_path = RealPath(path.c_str());
// Plugin path does not exist
@@ -138,7 +146,7 @@ void TBEPluginManager::FindParserSo(const string &path, vector<string> &file_lis
ProcessSoFullName(file_list, caffe_parser_path, full_name, caffe_parser_so_suff, aicpu_so_suff,
aicpu_host_so_suff);
} else {
FindParserSo(full_name, file_list, caffe_parser_path);
FindParserSo(full_name, file_list, caffe_parser_path, recursive_depth + 1);
}
}
mmScandirFree(entries, ret);


+ 2
- 1
ge/common/ge/tbe_plugin_manager.h View File

@@ -57,7 +57,8 @@ class TBEPluginManager {
static void ProcessSoFullName(vector<string> &file_list, string &caffe_parser_path, string &full_name,
const string &caffe_parser_so_suff, const string &aicpu_so_suff,
const string &aicpu_host_so_suff);
static void FindParserSo(const string &path, vector<string> &file_list, string &caffe_parser_path);
static void FindParserSo(const string &path, vector<string> &file_list, string &caffe_parser_path,
int recursive_depth = 0);
static void GetPluginSoFileList(const string &path, vector<string> &file_list, string &caffe_parser_path);
static void GetCustomOpPath(std::string &customop_path);
void LoadCustomOpLib();


+ 0
- 4
ge/hybrid/node_executor/aicpu/aicpu_node_executor.cc View File

@@ -64,10 +64,6 @@ Status AicpuNodeTaskBase::InitExtInfo(const std::string &kernel_ext_info, int64_
GE_CHK_STATUS_RET(aicpu_ext_handle_.UpdateSessionInfoSessionId(session_id),
"[Update][SessionInfoSessionId] failed, session_id:%ld.", session_id);

bool execute_mode = !aicpu_ext_handle_.IsNeedRefreshIOAddr() && !node_item_->is_dynamic;
GE_CHK_STATUS_RET(aicpu_ext_handle_.UpdateExecuteMode(execute_mode),
"[Update][ExecuteMode] failed, node:%s.", node_name_.c_str());

// copy task args buf
GE_CHK_STATUS_RET(AllocTensorBuffer(aicpu_ext_handle_.GetExtInfoLen(), ext_info_addr_dev_),
"[Invoke][AllocTensorBuffer]Node[%s] alloc kernel_ext_info buf failed, size=%zu",


+ 5
- 10
ge/single_op/single_op_model.cc View File

@@ -376,11 +376,10 @@ Status SingleOpModel::BuildTaskList(StreamResource *stream_resource, SingleOp &s
} else if (task_type == RT_MODEL_TASK_KERNEL_EX) {
GELOGD("Building AICPU_TF task");
AiCpuTask *aicpu_task = nullptr;
bool depend_compute_flag = false;
uint64_t singleop_kernel_id = aicpu_kernel_id++;
GELOGI("Build singleOp TfTask, kernel_id = %lu", singleop_kernel_id);
GE_CHK_STATUS_RET_NOLOG(
BuildKernelExTask(task_def.kernel_ex(), &aicpu_task, false, depend_compute_flag, singleop_kernel_id));
BuildKernelExTask(task_def.kernel_ex(), &aicpu_task, singleop_kernel_id));
aicpu_task->SetModelArgs(model_name_, model_id_);
ParseArgTable(aicpu_task, single_op);
single_op.tasks_.emplace_back(aicpu_task);
@@ -457,8 +456,7 @@ Status SingleOpModel::BuildKernelTask(const domi::TaskDef &task_def, TbeOpTask *
return SUCCESS;
}

Status SingleOpModel::BuildKernelExTask(const domi::KernelExDef &kernel_def, AiCpuTask **task,
bool dynamic_flag, bool& depend_compute_flag, uint64_t kernel_id) {
Status SingleOpModel::BuildKernelExTask(const domi::KernelExDef &kernel_def, AiCpuTask **task, uint64_t kernel_id) {
auto iter = op_list_.find(kernel_def.op_index());
if (iter == op_list_.end()) {
GELOGE(ACL_ERROR_GE_INTERNAL_ERROR,
@@ -476,12 +474,11 @@ Status SingleOpModel::BuildKernelExTask(const domi::KernelExDef &kernel_def, AiC
return ACL_ERROR_GE_MEMORY_ALLOCATION;
}
auto builder = AiCpuTaskBuilder(iter->second->GetOpDesc(), kernel_def);
auto ret = builder.BuildTask(*aicpu_task, model_params_, dynamic_flag, kernel_id);
auto ret = builder.BuildTask(*aicpu_task, model_params_, kernel_id);
if (ret != SUCCESS) {
GELOGE(ret, "[Build][Task] failed, kernel_id:%lu.", kernel_id);
return ret;
}
depend_compute_flag = (aicpu_task->GetUnknownType() == DEPEND_COMPUTE);

*task = aicpu_task.release();
return SUCCESS;
@@ -628,12 +625,10 @@ Status SingleOpModel::BuildTaskListForDynamicOp(StreamResource *stream_resource,
}
GELOGD("Building AICPU_TF task");
AiCpuTask *aicpu_task = nullptr;
bool depend_compute_flag = false;
uint64_t dynamic_singleop_kernel_id = aicpu_kernel_id++;
GELOGI("Build dynamic singleOp TfTask, kernel_id = %lu", dynamic_singleop_kernel_id);
GE_CHK_STATUS_RET_NOLOG(BuildKernelExTask(task_def.kernel_ex(), &aicpu_task, true,
depend_compute_flag, dynamic_singleop_kernel_id));
if (depend_compute_flag) {
GE_CHK_STATUS_RET_NOLOG(BuildKernelExTask(task_def.kernel_ex(), &aicpu_task, dynamic_singleop_kernel_id));
if (aicpu_task->GetUnknownType() == DEPEND_COMPUTE) {
if (i >= tasks.size() - 1) {
GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Check][Task]The copy task of the fourth operator was not found.");
REPORT_INNER_ERROR("E19999", "The copy task of the fourth operator was not found.");


+ 1
- 2
ge/single_op/single_op_model.h View File

@@ -69,8 +69,7 @@ class SingleOpModel {
Status BuildTaskList(StreamResource *stream_resource, SingleOp &single_op);
Status BuildTaskListForDynamicOp(StreamResource *stream_resource, DynamicSingleOp &dynamic_single_op);
Status BuildKernelTask(const domi::TaskDef &task_def, TbeOpTask **task);
Status BuildKernelExTask(const domi::KernelExDef &kernel_def, AiCpuTask **task,
bool dynamic_flag, bool& depend_compute_flag, uint64_t kernel_id);
Status BuildKernelExTask(const domi::KernelExDef &kernel_def, AiCpuTask **task, uint64_t kernel_id);
Status BuildCpuKernelTask(const domi::KernelDef &kernel_def, OpTask **task, uint64_t kernel_id);
Status BuildModelTaskKernel(StreamResource *stream_resource, const domi::TaskDef &task_def,
DynamicSingleOp &single_op);


+ 3
- 5
ge/single_op/task/aicpu_task_builder.cc View File

@@ -63,7 +63,7 @@ namespace ge {
return SUCCESS;
}

Status AiCpuTaskBuilder::InitWorkspaceAndIO(AiCpuTask &task, const SingleOpModelParam &param, bool dynamic_flag) {
Status AiCpuTaskBuilder::InitWorkspaceAndIO(AiCpuTask &task, const SingleOpModelParam &param) {
if (kernel_def_.args_size() > sizeof(STR_FWK_OP_KERNEL)) {
GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Check][Size]sizeof STR_FWK_OP_KERNEL is: %lu, but args_size is: %d",
sizeof(STR_FWK_OP_KERNEL), kernel_def_.args_size());
@@ -83,9 +83,8 @@ namespace ge {
return SUCCESS;
}

Status AiCpuTaskBuilder::BuildTask(ge::AiCpuTask &task, const SingleOpModelParam &param,
bool dynamic_flag, uint64_t kernel_id) {
GE_CHK_STATUS_RET_NOLOG(InitWorkspaceAndIO(task, param, dynamic_flag));
Status AiCpuTaskBuilder::BuildTask(ge::AiCpuTask &task, const SingleOpModelParam &param, uint64_t kernel_id) {
GE_CHK_STATUS_RET_NOLOG(InitWorkspaceAndIO(task, param));

STR_FWK_OP_KERNEL fwk_op_kernel = {0};
auto ret = SetFmkOpKernel(task.io_addr_, task.workspace_addr_, fwk_op_kernel);
@@ -124,7 +123,6 @@ namespace ge {
task.arg_size_ = sizeof(STR_FWK_OP_KERNEL);
task.op_type_ = op_desc_->GetName();
task.task_info_ = kernel_def_.task_info();
task.dynamic_flag_ = dynamic_flag;
task.kernel_id_ = kernel_id;

auto debug_info = BuildTaskUtils::GetTaskInfo(op_desc_);


+ 2
- 2
ge/single_op/task/aicpu_task_builder.h View File

@@ -29,12 +29,12 @@ namespace ge {
AiCpuTaskBuilder(const OpDescPtr &op_desc, const domi::KernelExDef &kernel_def);
~AiCpuTaskBuilder() = default;

Status BuildTask(AiCpuTask &task, const SingleOpModelParam &param, bool dynamic_flag, uint64_t kernel_id);
Status BuildTask(AiCpuTask &task, const SingleOpModelParam &param, uint64_t kernel_id);

private:
static Status SetKernelArgs(void **args, STR_FWK_OP_KERNEL &kernel);
Status SetFmkOpKernel(void *io_addr, void *ws_addr, STR_FWK_OP_KERNEL &kernel);
Status InitWorkspaceAndIO(AiCpuTask &task, const SingleOpModelParam &param, bool dynamic_flag);
Status InitWorkspaceAndIO(AiCpuTask &task, const SingleOpModelParam &param);

const OpDescPtr op_desc_;
const domi::KernelExDef &kernel_def_;


+ 1
- 4
ge/single_op/task/op_task.cc View File

@@ -451,7 +451,6 @@ Status AiCpuBaseTask::SetExtInfoAndType(const std::string &kernel_ext_info, uint

GE_CHK_STATUS_RET(aicpu_ext_handle_->UpdateSessionInfo(ULLONG_MAX, kernel_id, false),
"[Update][SessionInfo] failed.");
GE_CHK_STATUS_RET(aicpu_ext_handle_->UpdateExecuteMode(true), "[Update][ExecuteMode] failed.");

GE_CHK_RT_RET(rtMalloc(&ext_info_addr_dev_, aicpu_ext_handle_->GetExtInfoLen(), RT_MEMORY_HBM));
GE_CHK_RT_RET(rtMemcpy(ext_info_addr_dev_, aicpu_ext_handle_->GetExtInfoLen(),
@@ -623,9 +622,7 @@ Status AiCpuBaseTask::UpdateIoAddr(const vector<DataBuffer> &inputs, const vecto
AiCpuTask::~AiCpuTask() {
FreeHbm(args_);
FreeHbm(io_addr_);
if (dynamic_flag_) {
FreeHbm(workspace_addr_);
}
FreeHbm(workspace_addr_);
FreeHbm(copy_workspace_buf_);
FreeHbm(copy_ioaddr_dev_);
FreeHbm(copy_input_release_flag_dev_);


+ 0
- 1
ge/single_op/task/op_task.h View File

@@ -192,7 +192,6 @@ class AiCpuTask : public AiCpuBaseTask {
// host addr
std::vector<void *> io_addr_host_;

bool dynamic_flag_ = false;
// for copy task
void *copy_task_args_buf_ = nullptr;
void *copy_workspace_buf_ = nullptr;


Loading…
Cancel
Save