Browse Source

support overflow detection when infer

tags/v1.3.0
zhou_lili 3 years ago
parent
commit
563e0994cf
8 changed files with 146 additions and 57 deletions
  1. +71
    -45
      ge/common/dump/dump_manager.cc
  2. +5
    -0
      ge/common/dump/dump_manager.h
  3. +17
    -6
      ge/common/dump/dump_properties.cc
  4. +13
    -2
      ge/common/dump/dump_properties.h
  5. +1
    -1
      ge/graph/load/model_manager/data_dumper.cc
  6. +9
    -3
      ge/hybrid/executor/hybrid_model_async_executor.cc
  7. +1
    -0
      inc/framework/common/ge_types.h
  8. +29
    -0
      tests/ut/ge/common/dump_manager_unittest.cc

+ 71
- 45
ge/common/dump/dump_manager.cc View File

@@ -23,6 +23,7 @@ const char *const kDumpOFF = "OFF";
const char *const kDumpoff = "off";
const char *const kDumpOn = "on";
const uint64_t kInferSessionId = 0;
const uint32_t kAllOverflow = 3;
} // namespace
namespace ge {
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY DumpManager &DumpManager::GetInstance() {
@@ -30,78 +31,103 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY DumpManager &DumpManager::GetIn
return instance;
}

FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status DumpManager::SetDumpConf(const DumpConfig &dump_config) {
DumpProperties dump_properties;
std::string dump_status;
std::string dump_path;
std::string dump_mode;
std::string dump_op_switch;

if (dump_config.dump_status.empty()) {
bool DumpManager::NeedDoDump(const DumpConfig &dump_config, DumpProperties &dump_properties) {
if (dump_config.dump_status.empty() && dump_config.dump_debug.empty()) {
dump_properties_map_.emplace(kInferSessionId, dump_properties);
GELOGI("Dump does not open");
return SUCCESS;
return false;
}

dump_status = dump_config.dump_status;
GELOGI("Dump status is %s", dump_status.c_str());
if (dump_config.dump_status == kDumpoff || dump_config.dump_status == kDumpOFF) {
GELOGI("Dump status is %s, dump debug is %s.", dump_config.dump_status.c_str(), dump_config.dump_debug.c_str());
if ((dump_config.dump_status == kDumpoff || dump_config.dump_status == kDumpOFF) &&
dump_config.dump_debug == kDumpoff) {
dump_properties.ClearDumpPropertyValue();
dump_properties_map_.emplace(kInferSessionId, dump_properties);
return SUCCESS;
return false;
}
if (dump_config.dump_status == kDumpOn && dump_config.dump_debug == kDumpOn) {
GELOGW("Not support coexistence of dump debug and dump status.");
return false;
}
dump_properties.SetDumpStatus(dump_status);
return true;
}

dump_op_switch = dump_config.dump_op_switch;
dump_properties.SetDumpOpSwitch(dump_op_switch);
if (dump_op_switch == kDumpoff && dump_config.dump_list.empty()) {
dump_properties_map_.emplace(kInferSessionId, dump_properties);
GELOGE(PARAM_INVALID, "[Check][DumpList]Invalid, dump_op_switch is %s",
dump_op_switch.c_str());
REPORT_INNER_ERROR("E19999", "Dump list check invalid, dump_op_switch is %s",
dump_op_switch.c_str());
return PARAM_INVALID;
// Turns on inference-time op debug (overflow detection) on `dump_properties`
// when dump_config.dump_debug equals kDumpOn, using kAllOverflow as the op
// debug mode. For any other dump_debug value `dump_properties` is left as is.
void DumpManager::SetDumpDebugConf(const DumpConfig &dump_config, DumpProperties &dump_properties) {
  if (dump_config.dump_debug != kDumpOn) {
    return;
  }
  GELOGI("Only do overflow detection, dump debug is %s.", dump_config.dump_debug.c_str());
  dump_properties.InitInferOpDebug();
  dump_properties.SetOpDebugMode(kAllOverflow);
}

if (!dump_config.dump_list.empty()) {
for (auto model_dump : dump_config.dump_list) {
std::string model_name = model_dump.model_name;
GELOGI("Dump model is %s", model_name.c_str());
std::set<std::string> dump_layers;
for (auto layer : model_dump.layers) {
GELOGI("Dump layer is %s in model", layer.c_str());
dump_layers.insert(layer);
}
dump_properties.AddPropertyValue(model_name, dump_layers);
// Copies the model/layer selection from dump_config.dump_list into
// `dump_properties`: for each listed model, its layer names are gathered into
// a std::set (so duplicates collapse) and registered under the model name via
// AddPropertyValue(). Every model and layer name is logged along the way.
void DumpManager::SetDumpList(const DumpConfig &dump_config, DumpProperties &dump_properties) {
  for (const auto &model_cfg : dump_config.dump_list) {
    std::string name = model_cfg.model_name;
    GELOGI("Dump model is %s", name.c_str());

    std::set<std::string> layer_set;
    for (const auto &layer_name : model_cfg.layers) {
      GELOGI("Dump layer is %s in model", layer_name.c_str());
      layer_set.insert(layer_name);
    }
    dump_properties.AddPropertyValue(name, layer_set);
  }
}

Status DumpManager::SetNormalDumpConf(const DumpConfig &dump_config, DumpProperties &dump_properties) {
if (dump_config.dump_status == kDumpOn) {
GELOGI("Only do normal dump process, dump status is %s.", dump_config.dump_status.c_str());
dump_properties.SetDumpStatus(dump_config.dump_status);
std::string dump_op_switch = dump_config.dump_op_switch;
dump_properties.SetDumpOpSwitch(dump_op_switch);
if (dump_op_switch == kDumpoff && dump_config.dump_list.empty()) {
dump_properties_map_.emplace(kInferSessionId, dump_properties);
GELOGE(PARAM_INVALID, "[Check][DumpList]Invalid, dump_op_switch is %s", dump_op_switch.c_str());
REPORT_INNER_ERROR("E19999", "Dump list check invalid, dump_op_switch is %s", dump_op_switch.c_str());
return PARAM_INVALID;
}
if (dump_op_switch == kDumpOn) {
GELOGI("Start to dump model and single op,dump op switch is %s", dump_op_switch.c_str());

if (!dump_config.dump_list.empty()) {
if (dump_op_switch == kDumpOn) {
GELOGI("Start to dump model and single op, dump op switch is %s", dump_op_switch.c_str());
} else {
GELOGI("Only dump model, dump op switch is %s", dump_op_switch.c_str());
}
SetDumpList(dump_config, dump_properties);
} else {
GELOGI("Only dump model,dump op switch is %s", dump_op_switch.c_str());
GELOGI("Only dump single op, dump op switch is %s", dump_op_switch.c_str());
}
} else {
GELOGI("Only dump single op,dump op switch is %s", dump_op_switch.c_str());
GELOGI("Dump mode is %s", dump_config.dump_mode.c_str());
dump_properties.SetDumpMode(dump_config.dump_mode);
}
return SUCCESS;
}

dump_path = dump_config.dump_path;
// Validates dump_config.dump_path and stores the effective dump directory on
// `dump_properties`.
//
// The stored path is dump_path, followed by a '/' if dump_path does not
// already end with one, followed by the string returned by CurrentTimeInStr()
// and a trailing '/'.
//
// Returns PARAM_INVALID (after logging and reporting the error) when
// dump_path is empty, SUCCESS otherwise.
Status DumpManager::SetDumpPath(const DumpConfig &dump_config, DumpProperties &dump_properties) {
  if (dump_config.dump_path.empty()) {
    GELOGE(PARAM_INVALID, "[Check][DumpPath]It is empty");
    REPORT_INNER_ERROR("E19999", "Dump path check is empty");
    return PARAM_INVALID;
  }

  // Non-empty is guaranteed by the check above, so back() is safe here.
  std::string full_path = dump_config.dump_path;
  if (full_path.back() != '/') {
    full_path += "/";
  }
  full_path += CurrentTimeInStr();
  full_path += "/";

  GELOGI("Dump path is %s", full_path.c_str());
  dump_properties.SetDumpPath(full_path);
  return SUCCESS;
}

dump_mode = dump_config.dump_mode;
GELOGI("Dump mode is %s", dump_mode.c_str());
dump_properties.SetDumpMode(dump_mode);
// Builds the dump properties for the inference session (kInferSessionId) from
// `dump_config` and stores them in dump_properties_map_.
//
// Steps, in order: decide whether any dump work is needed, apply the op-debug
// (overflow detection) settings, apply the normal dump settings, then resolve
// the dump path.
//
// Returns SUCCESS when no dump is needed or when every step succeeds.
// NOTE(review): GE_CHK_STATUS_RET presumably returns the failing status of
// the wrapped call (e.g. PARAM_INVALID) - confirm against the macro definition.
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status DumpManager::SetDumpConf(const DumpConfig &dump_config) {
DumpProperties dump_properties;
// A false result is not treated as an error: the call still reports SUCCESS.
if (!NeedDoDump(dump_config, dump_properties)) {
GELOGD("No need do dump process.");
return SUCCESS;
}
// Only has an effect when dump_debug is "on".
SetDumpDebugConf(dump_config, dump_properties);
// Only has an effect when dump_status is "on".
GE_CHK_STATUS_RET(SetNormalDumpConf(dump_config, dump_properties), "[Init][DumpConf] failed when dump status is on.");
// The dump path is validated for both the op-debug and the normal dump mode.
GE_CHK_STATUS_RET(SetDumpPath(dump_config, dump_properties), "[Init][DumpPath] failed.");
// operator[] assignment replaces any properties already stored for the session.
dump_properties_map_[kInferSessionId] = dump_properties;

return SUCCESS;
}



+ 5
- 0
ge/common/dump/dump_manager.h View File

@@ -34,6 +34,11 @@ class DumpManager {
void RemoveDumpProperties(uint64_t session_id);

private:
// Returns true when `dump_config` asks for dump work that SetDumpConf should
// go on to configure; returns false when dumping is not requested or when
// dump_status and dump_debug are both "on" (that combination is not supported).
bool NeedDoDump(const DumpConfig &dump_config, DumpProperties &dump_properties);
// Enables inference op debug (overflow detection) on `dump_properties` when
// dump_config.dump_debug is "on".
void SetDumpDebugConf(const DumpConfig &dump_config, DumpProperties &dump_properties);
// Stores the dump directory on `dump_properties`; returns PARAM_INVALID when
// dump_config.dump_path is empty.
Status SetDumpPath(const DumpConfig &dump_config, DumpProperties &dump_properties);
// Applies dump status, op switch, dump list and dump mode when
// dump_config.dump_status is "on"; returns PARAM_INVALID when the op switch is
// "off" and the dump list is empty.
Status SetNormalDumpConf(const DumpConfig &dump_config, DumpProperties &dump_properties);
// Registers every model in dump_config.dump_list, with its layers, on
// `dump_properties`.
void SetDumpList(const DumpConfig &dump_config, DumpProperties &dump_properties);
std::mutex mutex_;
std::map<uint64_t, DumpProperties> dump_properties_map_;
};


+ 17
- 6
ge/common/dump/dump_properties.cc View File

@@ -53,7 +53,8 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void DumpProperties::InitByOpti
dump_path_.clear();
dump_step_.clear();
dump_mode_.clear();
is_op_debug_ = false;
is_train_op_debug_ = false;
is_infer_op_debug_ = false;
op_debug_mode_ = 0;

std::string enable_dump;
@@ -124,7 +125,8 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void DumpProperties::ClearDumpI
dump_mode_.clear();
dump_op_switch_.clear();
dump_status_.clear();
is_op_debug_ = false;
is_train_op_debug_ = false;
is_infer_op_debug_ = false;
op_debug_mode_ = 0;
}

@@ -203,6 +205,14 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY const std::string &DumpProperti
return dump_status_;
}

// Marks op debug as enabled for inference by setting is_infer_op_debug_.
// The op debug mode itself is not touched here; callers set it separately
// through SetOpDebugMode().
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void DumpProperties::InitInferOpDebug() {
is_infer_op_debug_ = true;
}

// Stores `op_debug_mode` as the op debug mode. The value is stored as given,
// with no validation; it does not change the op-debug enable flags.
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void DumpProperties::SetOpDebugMode(const uint32_t &op_debug_mode) {
op_debug_mode_ = op_debug_mode;
}

FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void DumpProperties::SetDumpOpSwitch(
const std::string &dump_op_switch) {
dump_op_switch_ = dump_op_switch;
@@ -237,7 +247,8 @@ void DumpProperties::CopyFrom(const DumpProperties &other) {
dump_op_switch_ = other.dump_op_switch_;

model_dump_properties_map_ = other.model_dump_properties_map_;
is_op_debug_ = other.is_op_debug_;
is_train_op_debug_ = other.is_train_op_debug_;
is_infer_op_debug_ = other.is_infer_op_debug_;
op_debug_mode_ = other.op_debug_mode_;
}
}
@@ -254,15 +265,15 @@ void DumpProperties::SetDumpDebugOptions() {

if (dump_debug_mode == OP_DEBUG_AICORE) {
GELOGD("ge.exec.dumpDebugMode=aicore_overflow, op debug is open.");
is_op_debug_ = true;
is_train_op_debug_ = true;
op_debug_mode_ = kAicoreOverflow;
} else if (dump_debug_mode == OP_DEBUG_ATOMIC) {
GELOGD("ge.exec.dumpDebugMode=atomic_overflow, op debug is open.");
is_op_debug_ = true;
is_train_op_debug_ = true;
op_debug_mode_ = kAtomicOverflow;
} else if (dump_debug_mode == OP_DEBUG_ALL) {
GELOGD("ge.exec.dumpDebugMode=all, op debug is open.");
is_op_debug_ = true;
is_train_op_debug_ = true;
op_debug_mode_ = kAllOverflow;
} else {
GELOGW("ge.exec.dumpDebugMode is invalid.");


+ 13
- 2
ge/common/dump/dump_properties.h View File

@@ -65,16 +65,26 @@ class DumpProperties {

const std::string &GetDumpStatus() const;

void InitInferOpDebug();

// Returns true once op debug has been enabled for inference, i.e. after
// InitInferOpDebug() has set is_infer_op_debug_.
bool IsInferOpDebug() const {
return is_infer_op_debug_;
}

void SetDumpOpSwitch(const std::string &dump_op_switch);

const std::string &GetDumpOpSwitch() const;

bool IsOpDebugOpen() const { return is_op_debug_; }
// Returns true when op debug is enabled for either mode: the train flag
// (is_train_op_debug_) or the infer flag (is_infer_op_debug_).
bool IsOpDebugOpen() const {
return is_train_op_debug_ || is_infer_op_debug_;
}

bool IsDumpOpen() const;

bool IsSingleOpNeedDump() const;

void SetOpDebugMode(const uint32_t &op_debug_mode);

uint32_t GetOpDebugMode() const { return op_debug_mode_; }

const std::string &GetEnableDump() const {return enable_dump_;}
@@ -96,7 +106,8 @@ class DumpProperties {
std::string dump_op_switch_;
std::map<std::string, std::set<std::string>> model_dump_properties_map_;

bool is_op_debug_ = false;
bool is_train_op_debug_ = false;
bool is_infer_op_debug_ = false;
uint32_t op_debug_mode_ = 0;
};
}


+ 1
- 1
ge/graph/load/model_manager/data_dumper.cc View File

@@ -663,7 +663,7 @@ Status DataDumper::LoadDumpInfo() {
SetOpDebugIdToAicpu(op_debug_task_id_, op_debug_stream_id_, op_debug_addr_, op_mapping_info);

if (!op_list_.empty() || is_op_debug_ || is_end_graph_) {
auto ret = ExecuteLoadDumpInfo(op_mapping_info);
ret = ExecuteLoadDumpInfo(op_mapping_info);
if (ret != SUCCESS) {
GELOGE(ret, "Execute load dump info failed");
return ret;


+ 9
- 3
ge/hybrid/executor/hybrid_model_async_executor.cc View File

@@ -544,9 +544,15 @@ Status HybridModelAsyncExecutor::DumpOpDebug() {
data_dumper_.SetModelId(model_->GetModelId());
data_dumper_.SetDeviceId(model_->GetDeviceId());
void *global_step = nullptr;
TensorValue *varible_global_step = model_->GetVariable(NODE_NAME_GLOBAL_STEP);
if (varible_global_step != nullptr) {
global_step = const_cast<void *>(varible_global_step->GetData());

if (dump_properties.IsInferOpDebug()) {
GELOGD("Init global step when infer with op debug.");
global_step = executor_->GetContext()->global_step;
} else {
TensorValue *varible_global_step = model_->GetVariable(NODE_NAME_GLOBAL_STEP);
if (varible_global_step != nullptr) {
global_step = const_cast<void *>(varible_global_step->GetData());
}
}

void *loop_per_iter = nullptr;


+ 1
- 0
inc/framework/common/ge_types.h View File

@@ -293,6 +293,7 @@ struct DumpConfig {
std::string dump_mode;
std::string dump_status;
std::string dump_op_switch;
std::string dump_debug;
std::vector<ModelDumpConfig> dump_list;
};
} // namespace ge


+ 29
- 0
tests/ut/ge/common/dump_manager_unittest.cc View File

@@ -67,6 +67,35 @@ TEST_F(UTEST_dump_manager, is_dump_single_op_close_success) {
EXPECT_EQ(ret, ge::SUCCESS);
}

// dump_debug and dump_status are both "on".
// SetDumpConf does not support the two together; it is still expected to
// report SUCCESS rather than an error.
TEST_F(UTEST_dump_manager, dump_op_debug_on) {
  DumpConfig config;
  config.dump_status = "on";
  config.dump_debug = "on";

  const auto status = DumpManager::GetInstance().SetDumpConf(config);
  EXPECT_EQ(status, ge::SUCCESS);
}

// Only dump_status is "on"; no dump list and no dump_path are provided.
// NOTE(review): PARAM_INVALID presumably comes from the empty dump_path check
// in SetDumpPath - confirm if the configuration rules change.
TEST_F(UTEST_dump_manager, dump_status_without_dump_list) {
  DumpConfig config;
  config.dump_status = "on";

  const auto status = DumpManager::GetInstance().SetDumpConf(config);
  EXPECT_EQ(status, ge::PARAM_INVALID);
}

// dump_status is "on" and a dump list with one model ("test") and one layer
// ("first") is provided, but dump_path is still left empty.
// NOTE(review): PARAM_INVALID presumably comes from the empty dump_path check
// in SetDumpPath, not from the dump list itself - confirm.
TEST_F(UTEST_dump_manager, dump_status_with_dump_list) {
  ModelDumpConfig model_cfg;
  model_cfg.model_name = "test";
  model_cfg.layers.push_back("first");

  DumpConfig config;
  config.dump_status = "on";
  config.dump_list.push_back(model_cfg);

  const auto status = DumpManager::GetInstance().SetDumpConf(config);
  EXPECT_EQ(status, ge::PARAM_INVALID);
}

TEST_F(UTEST_dump_manager, add_dump_properties_success) {
DumpProperties dump_properties;
DumpManager::GetInstance().AddDumpProperties(0, dump_properties);


Loading…
Cancel
Save