Browse Source

!2026 bugfix for taskdefs random variation in offline case

Merge pull request !2026 from gengchao/om
tags/v1.5.1
i-robot Gitee 3 years ago
parent
commit
4302e526bc
2 changed files with 81 additions and 4 deletions
  1. +5
    -3
      ge/graph/build/task_generator.cc
  2. +76
    -1
      tests/ut/ge/graph/build/task_generator_unittest.cc

+ 5
- 3
ge/graph/build/task_generator.cc View File

@@ -50,6 +50,7 @@ const char *const kIsInputVar = "INPUT_IS_VAR";
const char *const kIsOutputVar = "OUTPUT_IS_VAR"; const char *const kIsOutputVar = "OUTPUT_IS_VAR";
const char *const kProfilingMode = "PROFILING_MODE"; const char *const kProfilingMode = "PROFILING_MODE";
const char *const kIteratorV2 = "IteratorV2"; const char *const kIteratorV2 = "IteratorV2";
const char *const kKernelInfoNameHccl = "ops_kernel_info_hccl";
const uint32_t kProfilingArStep = 2; const uint32_t kProfilingArStep = 2;
const uint64_t kProfilingFpStartLogid = 1; const uint64_t kProfilingFpStartLogid = 1;
const uint64_t kProfilingBpEndLogid = 2; const uint64_t kProfilingBpEndLogid = 2;
@@ -437,14 +438,15 @@ Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &gra
} }


// Reset stream id to ge stream id, as graph load must use ge stream to reassign stream // Reset stream id to ge stream id, as graph load must use ge stream to reassign stream
void *ops_kernel_info_store_ptr = kernel_info_store.get();
for (size_t idx = task_list_size_before; idx < task_list_size_after; ++idx) { for (size_t idx = task_list_size_before; idx < task_list_size_after; ++idx) {
task_def_list[idx].set_stream_id(static_cast<uint32_t>(stream_id)); task_def_list[idx].set_stream_id(static_cast<uint32_t>(stream_id));
op_name_map[idx] = name; op_name_map[idx] = name;
// Set opsKernelInfoStorePtr and op_index, the two fields be use in DistributeTask and InitTaskInfo
TaskDef *task_def_ptr = &task_def_list[idx]; TaskDef *task_def_ptr = &task_def_list[idx];
GE_CHECK_NOTNULL(task_def_ptr); GE_CHECK_NOTNULL(task_def_ptr);
task_def_ptr->set_ops_kernel_store_ptr(reinterpret_cast<uintptr_t>(ops_kernel_info_store_ptr));
// Set opsKernelInfoStorePtr for hccl, which will be used in DistributeTask and InitTaskInfo
if (op_kernel_lib_name == kKernelInfoNameHccl) {
task_def_ptr->set_ops_kernel_store_ptr(reinterpret_cast<uintptr_t>(kernel_info_store.get()));
}
} }
GELOGD("Call %s to generate node[name:%s(%s), id:%ld, stream_id:%ld] task finished, generate %zu task(s).", GELOGD("Call %s to generate node[name:%s(%s), id:%ld, stream_id:%ld] task finished, generate %zu task(s).",
op_kernel_lib_name.c_str(), name.c_str(), type.c_str(), op_id, stream_id, op_kernel_lib_name.c_str(), name.c_str(), type.c_str(), op_id, stream_id,


+ 76
- 1
tests/ut/ge/graph/build/task_generator_unittest.cc View File

@@ -29,6 +29,8 @@


#define protected public #define protected public
#define private public #define private public
#include "init/gelib.h"
#include "ge/opskernel_manager/ops_kernel_builder_manager.h"
#include "graph/build/task_generator.h" #include "graph/build/task_generator.h"
#include "graph/manager/graph_mem_manager.h" #include "graph/manager/graph_mem_manager.h"
#include "graph/manager/graph_var_manager.h" #include "graph/manager/graph_var_manager.h"
@@ -41,9 +43,46 @@ using namespace ge;
namespace { namespace {
const char *const kIsInputVar = "INPUT_IS_VAR"; const char *const kIsInputVar = "INPUT_IS_VAR";
const char *const kIsOutputVar = "OUTPUT_IS_VAR"; const char *const kIsOutputVar = "OUTPUT_IS_VAR";
}
const char *const kKernelInfoNameHccl = "ops_kernel_info_hccl";
} // namespace
class UtestTaskGeneratorTest : public testing::Test { class UtestTaskGeneratorTest : public testing::Test {
public: public:
// Test double for OpsKernelBuilder: every lifecycle hook reports SUCCESS and
// GenerateTask appends exactly one default-constructed TaskDef to `tasks`.
struct FakeOpsKernelBuilder : OpsKernelBuilder {
  FakeOpsKernelBuilder() {}

 private:
  // No setup is performed; always succeeds.
  Status Initialize(const map<std::string, std::string> &options) override { return SUCCESS; }

  // No teardown is performed; always succeeds.
  Status Finalize() override { return SUCCESS; }

  // Leaves the node untouched; always succeeds.
  Status CalcOpRunningParam(Node &node) override { return SUCCESS; }

  // Appends one empty task so the caller sees a single generated task per node.
  Status GenerateTask(const Node &node, RunContext &context, std::vector<domi::TaskDef> &tasks) override {
    tasks.emplace_back();
    return SUCCESS;
  }
};

// Test double for OpsKernelInfoStore: initialization and finalization report
// SUCCESS, every op is reported as supported, and no kernel info is published.
struct FakeOpsKernelInfoStore : OpsKernelInfoStore {
  FakeOpsKernelInfoStore() = default;

 private:
  // No setup is performed; always succeeds.
  Status Initialize(const std::map<std::string, std::string> &options) override { return SUCCESS; }

  // No teardown is performed; always succeeds.
  Status Finalize() override { return SUCCESS; }

  // Accepts every op; `reason` is left unmodified.
  bool CheckSupported(const OpDescPtr &op_desc, std::string &reason) const override { return true; }

  // Intentionally leaves `infos` unchanged.
  void GetAllOpsKernelInfo(std::map<std::string, ge::OpInfo> &infos) const override {}
};

ge::ComputeGraphPtr BuildGraphFpProfiling() { ge::ComputeGraphPtr BuildGraphFpProfiling() {
ge::ut::GraphBuilder builder("graph"); ge::ut::GraphBuilder builder("graph");
auto data = builder.AddNode("data", "phony", 1, 1); auto data = builder.AddNode("data", "phony", 1, 1);
@@ -95,6 +134,14 @@ class UtestTaskGeneratorTest : public testing::Test {


return builder.GetGraph(); return builder.GetGraph();
} }
// Builds a graph named "graph" containing a single node with no inputs and no
// outputs, whose op desc is bound to the HCCL kernel lib name and stream 0.
ge::ComputeGraphPtr BuildHcclGraph() {
  ge::ut::GraphBuilder graph_builder("graph");
  const auto node = graph_builder.AddNode("hccl_phony_node", "HCCL_PHONY", 0, 0);
  const auto desc = node->GetOpDesc();
  desc->SetOpKernelLibName(kKernelInfoNameHccl);
  desc->SetStreamId(0);
  return graph_builder.GetGraph();
}


protected: protected:
void SetUp() {} void SetUp() {}
@@ -156,3 +203,31 @@ TEST_F(UtestTaskGeneratorTest, AutoFindBpOpIndex) {
output_desc->SetName("hcom"); output_desc->SetName("hcom");
EXPECT_EQ(task_generator.AutoFindBpOpIndex(graph, profiling_point, all_reduce_nodes), SUCCESS); EXPECT_EQ(task_generator.AutoFindBpOpIndex(graph, profiling_point, all_reduce_nodes), SUCCESS);
} }

// Checks that TaskGenerator::GenerateTask, run on a graph with a single node
// bound to the HCCL kernel lib, produces exactly one TaskDef and that the
// TaskDef's ops_kernel_store_ptr equals the address of the kernel info store
// registered under the HCCL kernel lib name.
TEST_F(UtestTaskGeneratorTest, GenerateTask) {
  map<string, string> options;
  Status ret = ge::GELib::Initialize(options);
  EXPECT_EQ(ret, SUCCESS);

  shared_ptr<GELib> instance_ptr = ge::GELib::GetInstance();
  // Fatal assertion: instance_ptr is dereferenced immediately below.
  ASSERT_NE(instance_ptr, nullptr);

  // Register the fake kernel info store under the HCCL kernel lib name.
  OpsKernelInfoStorePtr ops_kernel_info_store_ptr = MakeShared<FakeOpsKernelInfoStore>();
  instance_ptr->opsManager_.ops_kernel_store_.insert(make_pair(kKernelInfoNameHccl, ops_kernel_info_store_ptr));

  // Register the fake builder under the same name so task generation is served by it.
  OpsKernelBuilderManager &builder_manager_instance_ptr = ge::OpsKernelBuilderManager::Instance();
  OpsKernelBuilderPtr fake_builder = MakeShared<FakeOpsKernelBuilder>();
  builder_manager_instance_ptr.ops_kernel_builders_[kKernelInfoNameHccl] = fake_builder;

  auto graph = BuildHcclGraph();
  TaskGenerator task_generator(nullptr, 0);
  RunContext run_context;
  // The store address is only used here as an arbitrary non-null stream handle.
  run_context.graphStreamList.push_back(static_cast<void *>(ops_kernel_info_store_ptr.get()));
  vector<uint32_t> all_reduce_nodes;
  vector<domi::TaskDef> task_def_list;
  map<uint32_t, string> op_name_map;

  EXPECT_EQ(task_generator.GenerateTask(run_context, graph, task_def_list, op_name_map), SUCCESS);
  // Fatal assertion: task_def_list[0] is read immediately below, which would be
  // undefined behaviour on an empty vector.
  ASSERT_EQ(task_def_list.size(), 1U);
  EXPECT_EQ(task_def_list[0].ops_kernel_store_ptr(), reinterpret_cast<uintptr_t>(ops_kernel_info_store_ptr.get()));
}

Loading…
Cancel
Save