You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

graph_partition.cc 64 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
5 years ago
5 years ago
4 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
5 years ago
5 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
5 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239
  1. /**
  2. * Copyright 2019-2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "graph/partition/graph_partition.h"
  17. #include <algorithm>
  18. #include <memory>
  19. #include <string>
  20. #include <unordered_set>
  21. #include <vector>
  22. #include "analyzer/analyzer.h"
  23. #include "framework/common/op/ge_op_utils.h"
  24. #include "common/ge_call_wrapper.h"
  25. #include "graph/utils/graph_utils.h"
  26. #include "graph/utils/op_desc_utils.h"
  27. #include "graph/utils/type_utils.h"
  28. #include "init/gelib.h"
  // File-local constants of the graph partitioner.
  namespace {
  // NOTE(review): not referenced in the visible part of this file - presumably used further down.
  constexpr const char *kEngineDefaultData = "ENGINE_DEFAULT_DATA";
  // Node type string compared against Node::GetType() to recognise End nodes.
  constexpr const char *kEndType = "End";
  // Node type string compared against Node::GetType() to recognise PlaceHolder nodes.
  constexpr const char *kPlaceHolderType = "PlaceHolder";
  // Exactly one graph.
  constexpr int kOneGraph = 1;
  // The graph list is ordered 0,1,2,3...; 1 denotes the second entry.
  constexpr int kRankOne = 1;
  // The graph list is ordered 0,1,2,3...; 0 denotes the first entry.
  constexpr int kRankZero = 0;
  }  // namespace
  37. namespace ge {
  38. Status ge::GraphPartitioner::CheckIfEnd2PldEmpty(ge::ComputeGraphPtr &output_merged_compute_graph) {
  39. // only one condition:no data node, one engine, there is only one graph + input graph
  40. if (graph_info_.partitions_.size() == kOneGraph) {
  41. auto partition = (*graph_info_.partitions_.begin());
  42. if (partition.first == nullptr) {
  43. REPORT_INNER_ERROR("E19999", "partition.first is nullptr, check invalid, engine name is %s",
  44. partition.second.c_str());
  45. GELOGE(GE_GRAPH_EMPTY_PARTITION, "[Check][Param] partition.first is null, engine name is %s",
  46. partition.second.c_str());
  47. return FAILED;
  48. }
  49. output_merged_compute_graph = partition.first;
  50. } else { // if placeholder to end map is empty, it should be an exception condition
  51. REPORT_INNER_ERROR("E19999", "partitions size:%zu is not 1, check invalid.", graph_info_.partitions_.size());
  52. GELOGE(GE_GRAPH_EMPTY_PARTITION,
  53. "[Check][Param] placeholder to end map is empty, partitions size:%zu is not 1.",
  54. graph_info_.partitions_.size());
  55. return FAILED;
  56. }
  57. return SUCCESS;
  58. }
  59. Status ge::GraphPartitioner::MergeAllSubGraph(ge::ComputeGraphPtr &output_merged_compute_graph,
  60. const std::vector<SubGraphInfoPtr> &sub_graph_list) {
  61. for (size_t rank = 0; rank < graph_info_.rank_2_partitions_.size(); rank++) {
  62. string temp_stream;
  63. // sub_graph_list index is one ahead of rank_2_partitions_list index
  64. if (rank > 0) {
  65. temp_stream = sub_graph_list[rank - 1]->GetStreamLabel();
  66. }
  67. for (const auto &node : graph_info_.rank_2_partitions_[rank]->GetDirectNode()) {
  68. if (node == nullptr) {
  69. continue;
  70. }
  71. if ((node->GetType() == kEndType) || (node->GetType() == kPlaceHolderType)) {
  72. continue;
  73. }
  74. if (!temp_stream.empty() && !AttrUtils::HasAttr(node->GetOpDesc(), ATTR_NAME_STREAM_LABEL)) {
  75. (void)AttrUtils::SetStr(node->GetOpDesc(), ATTR_NAME_STREAM_LABEL, temp_stream);
  76. }
  77. if (node->SetOwnerComputeGraph(output_merged_compute_graph) != GRAPH_SUCCESS) {
  78. REPORT_CALL_ERROR("E19999", "SetOwnerComputeGraph for node:%s failed.", node->GetName().c_str());
  79. GELOGE(GE_GRAPH_PARAM_NULLPTR, "[Set][OwnerComputeGraph] failed, node %s", node->GetName().c_str());
  80. return FAILED;
  81. }
  82. (void)output_merged_compute_graph->AddNode(node);
  83. }
  84. }
  85. // get session graph id from subgraph
  86. SetMergedGraphId(output_merged_compute_graph);
  87. return SUCCESS;
  88. }
  89. void ge::GraphPartitioner::SetMergedGraphId(ge::ComputeGraphPtr &output_merged_compute_graph) {
  90. string session_graph_id;
  91. // get session graph id from subgraph
  92. if (graph_info_.rank_2_partitions_.empty() ||
  93. !AttrUtils::GetStr(*(graph_info_.rank_2_partitions_[0]), ATTR_NAME_SESSION_GRAPH_ID, session_graph_id)) {
  94. GELOGW("Get graph session_graph_id attr failed.");
  95. }
  96. // set session graph id into merged subgraph
  97. if (!session_graph_id.empty()) {
  98. GELOGI("Set session graph id %s in merged compute graph", session_graph_id.c_str());
  99. // private function, promise output_merged_compute_graph not null
  100. GE_IF_BOOL_EXEC(!AttrUtils::SetStr(*output_merged_compute_graph, ATTR_NAME_SESSION_GRAPH_ID, session_graph_id),
  101. GELOGW("SetStr ATTR_NAME_SESSION_GRAPH_ID failed");)
  102. }
  103. }
  104. Status ge::GraphPartitioner::RemoveNodeAndEdgeBetweenEndPld(ge::ComputeGraphPtr &output_merged_compute_graph,
  105. const std::vector<SubGraphInfoPtr> &sub_graph_list) {
  106. if ((output_merged_compute_graph == nullptr) ||
  107. (MergeAllSubGraph(output_merged_compute_graph, sub_graph_list) != SUCCESS)) {
  108. REPORT_INNER_ERROR("E19999", "output_merged_compute_graph is nullptr or Call MergeAllSubGraph failed.");
  109. GELOGE(GE_GRAPH_PARAM_NULLPTR, "[Merge][AllSubGraph] failed.");
  110. return FAILED;
  111. }
  112. for (const auto &it : graph_info_.index_2_end_) {
  113. auto &end = it.second;
  114. auto &pld = graph_info_.end_2_pld_[it.second];
  115. if ((end != nullptr) && (pld != nullptr) && (end->GetInDataAnchor(0) != nullptr) &&
  116. (pld->GetOutDataAnchor(0) != nullptr)) {
  117. AnchorPtr end_in_anchor = (end->GetInDataAnchor(0)->GetFirstPeerAnchor() == nullptr)
  118. ? Anchor::DynamicAnchorCast<Anchor>(end->GetInControlAnchor())
  119. : Anchor::DynamicAnchorCast<Anchor>(end->GetInDataAnchor(0));
  120. AnchorPtr pld_out_anchor = (pld->GetOutDataAnchor(0)->GetFirstPeerAnchor() == nullptr)
  121. ? Anchor::DynamicAnchorCast<Anchor>(pld->GetOutControlAnchor())
  122. : Anchor::DynamicAnchorCast<Anchor>(pld->GetOutDataAnchor(0));
  123. auto src_anchor = end_in_anchor->GetFirstPeerAnchor(); // src_anchor should be only 1
  124. if (GraphUtils::RemoveEdge(src_anchor, end_in_anchor) != GRAPH_SUCCESS) {
  125. REPORT_CALL_ERROR("E19999", "RemoveEdge between %s and %s failed",
  126. src_anchor->GetOwnerNode()->GetName().c_str(),
  127. end_in_anchor->GetOwnerNode()->GetName().c_str());
  128. GELOGE(GE_GRAPH_PARAM_NULLPTR, "[Remove][Edge] between %s and %s failed. node_name:%s, graph_name:%s",
  129. src_anchor->GetOwnerNode()->GetName().c_str(), end_in_anchor->GetOwnerNode()->GetName().c_str(),
  130. end->GetName().c_str(), end->GetOwnerComputeGraph()->GetName().c_str());
  131. return FAILED;
  132. }
  133. GE_CHECK_NOTNULL(pld_out_anchor);
  134. for (const auto &peer_in_anchor : pld_out_anchor->GetPeerAnchors()) {
  135. if (GraphUtils::RemoveEdge(pld_out_anchor, peer_in_anchor) != GRAPH_SUCCESS) {
  136. REPORT_CALL_ERROR("E19999", "RemoveEdge between %s and %s failed",
  137. pld_out_anchor->GetOwnerNode()->GetName().c_str(),
  138. peer_in_anchor->GetOwnerNode()->GetName().c_str());
  139. GELOGE(GE_GRAPH_PARAM_NULLPTR, "[Remove][Edge] between %s and %s failed. node_name:%s, graph_name:%s",
  140. pld_out_anchor->GetOwnerNode()->GetName().c_str(), peer_in_anchor->GetOwnerNode()->GetName().c_str(),
  141. pld->GetName().c_str(), pld->GetOwnerComputeGraph()->GetName().c_str());
  142. return FAILED;
  143. }
  144. if (GraphUtils::AddEdge(src_anchor, peer_in_anchor) != GRAPH_SUCCESS) {
  145. REPORT_CALL_ERROR("E19999", "AddEdge from %s to %s failed.",
  146. src_anchor->GetOwnerNode()->GetName().c_str(),
  147. peer_in_anchor->GetOwnerNode()->GetName().c_str());
  148. GELOGE(GE_GRAPH_PARAM_NULLPTR, "[Add][Edge] from %s to %s failed.",
  149. src_anchor->GetOwnerNode()->GetName().c_str(), peer_in_anchor->GetOwnerNode()->GetName().c_str());
  150. return FAILED;
  151. }
  152. }
  153. } else {
  154. GELOGW("End or pld is nullptr or in data anchor of end is nullptr or out data anchor of pld is nullptr");
  155. }
  156. }
  157. return SUCCESS;
  158. }
  159. Status ge::GraphPartitioner::MergeAfterSubGraphOptimization(ge::ComputeGraphPtr &output_merged_compute_graph,
  160. const ge::ComputeGraphPtr &original_compute_graph) {
  161. Status real_ret = SUCCESS;
  162. auto ret = MergeSubGraph(output_merged_compute_graph, original_compute_graph);
  163. if (ret != SUCCESS) {
  164. // even though failed, ensure all op do finish check support
  165. real_ret = FAILED;
  166. GELOGE(ret, "[Merge][SubGraph] Failed, ret:%d", ret);
  167. }
  168. GE_CHECK_NOTNULL(original_compute_graph);
  169. output_merged_compute_graph->SetName(original_compute_graph->GetName());
  170. // partition sub graph
  171. for (const auto &sub_graph : original_compute_graph->GetAllSubgraphs()) {
  172. ComputeGraphPtr merged_sub_graph = nullptr;
  173. ret = MergeSubGraph(merged_sub_graph, sub_graph);
  174. if (ret != SUCCESS) {
  175. real_ret = FAILED;
  176. GELOGE(ret, "[Merge][SubGraph] Failed, ret:%d", ret);
  177. continue;
  178. }
  179. // this means subgraph added in optimize subgraph and without partitions, so just add to root graph
  180. if (merged_sub_graph == sub_graph) {
  181. GELOGI("Just add subgraph %s (parent node is %s) to root graph %s.", sub_graph->GetName().c_str(),
  182. sub_graph->GetParentNode()->GetName().c_str(), output_merged_compute_graph->GetName().c_str());
  183. sub_graph->SetParentGraph(sub_graph->GetParentNode()->GetOwnerComputeGraph());
  184. GE_IF_BOOL_EXEC(output_merged_compute_graph->AddSubgraph(sub_graph->GetName(), merged_sub_graph) != SUCCESS,
  185. return FAILED;)
  186. continue;
  187. }
  188. // add sub graph
  189. merged_sub_graph->SetName(sub_graph->GetName());
  190. merged_sub_graph->SetInputSize(sub_graph->GetInputSize());
  191. merged_sub_graph->SetOutputSize(sub_graph->GetOutputSize());
  192. auto parent_node = sub_graph->GetParentNode();
  193. GE_IF_BOOL_EXEC(parent_node == nullptr,
  194. REPORT_INNER_ERROR("E19999", "Parent node of graph:%s is nullptr.",
  195. sub_graph->GetName().c_str());
  196. GELOGE(FAILED, "[Check][Param] Parent node is null, graph name is %s",
  197. sub_graph->GetName().c_str());
  198. return FAILED;)
  199. auto original_graph = parent_node->GetOwnerComputeGraph();
  200. GE_IF_BOOL_EXEC(graph_2_graph_partition_info_.find(original_graph) == graph_2_graph_partition_info_.end(),
  201. REPORT_INNER_ERROR("E19999", "graph:%s not find in graph_2_graph_partition_info_, check invalid.",
  202. original_graph->GetName().c_str());
  203. GELOGE(FAILED, "[Check][Param] Find graph info failed, graph name is %s",
  204. original_graph->GetName().c_str());
  205. return FAILED;)
  206. auto graph_info = graph_2_graph_partition_info_[original_graph];
  207. GE_IF_BOOL_EXEC(graph_info.corresponding_node_in_partitions_.count(parent_node) == 0,
  208. REPORT_INNER_ERROR("E19999", "node:%s not find in corresponding_node_in_partitions_, "
  209. "check invalid", parent_node->GetName().c_str());
  210. GELOGE(FAILED, "[Check][Param] Find corresponding node failed, parent node name is %s",
  211. parent_node->GetName().c_str());
  212. return FAILED;)
  213. auto corresponding_node = graph_info.corresponding_node_in_partitions_[parent_node];
  214. GE_IF_BOOL_EXEC(corresponding_node == nullptr,
  215. REPORT_INNER_ERROR("E19999", "Get null node in corresponding_node_in_partitions_, "
  216. "first node name is %s", parent_node->GetName().c_str());
  217. GELOGE(FAILED, "[Check][Param] Get null node in corresponding_node_in_partitions_, "
  218. "first node name is %s", parent_node->GetName().c_str());
  219. return FAILED;);
  220. merged_sub_graph->SetParentNode(corresponding_node);
  221. auto subgraph_parent_graph = corresponding_node->GetOwnerComputeGraph();
  222. merged_sub_graph->SetParentGraph(subgraph_parent_graph);
  223. ret = output_merged_compute_graph->AddSubgraph(sub_graph->GetName(), merged_sub_graph);
  224. GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, return ret;)
  225. }
  226. ClearAllPartitionData();
  227. if (real_ret != SUCCESS) {
  228. auto root_graph = ge::GraphUtils::FindRootGraph(original_compute_graph);
  229. GE_CHECK_NOTNULL(root_graph);
  230. (void)Analyzer::GetInstance()->SaveAnalyzerDataToFile(root_graph->GetSessionID(), root_graph->GetGraphID());
  231. }
  232. return real_ret;
  233. }
  234. Status ge::GraphPartitioner::MergeSubGraph(ge::ComputeGraphPtr &output_merged_compute_graph,
  235. const ge::ComputeGraphPtr &original_compute_graph) {
  236. if (original_compute_graph == nullptr) {
  237. REPORT_INNER_ERROR("E19999", "Param original_compute_graph is nullptr, check invalid.");
  238. GELOGE(GE_GRAPH_NULL_INPUT, "[Check][Param] original_compute_graph is nullptr.");
  239. return FAILED;
  240. }
  241. if ((graph_2_graph_partition_info_.find(original_compute_graph) == graph_2_graph_partition_info_.end()) ||
  242. (graph_2_subgraph_list_.find(original_compute_graph) == graph_2_subgraph_list_.end())) {
  243. GELOGW("[GraphPartition]: compute_graph has not found, just return original.");
  244. output_merged_compute_graph = original_compute_graph;
  245. return SUCCESS;
  246. }
  247. GraphPartitionInfo &subgraph_info = graph_2_graph_partition_info_[original_compute_graph];
  248. const auto &sub_graph_list = graph_2_subgraph_list_[original_compute_graph];
  249. graph_info_ = subgraph_info;
  250. if (graph_info_.mode_ != kMerging) {
  251. REPORT_INNER_ERROR("E19999", "Cannot call merging in partition mode, as mode != %d", kMerging);
  252. GELOGE(GE_GRAPH_UNSUPPORTED, "[Check][Param] Cannot call merging in partition mode, as mode != %d", kMerging);
  253. return FAILED;
  254. }
  255. GELOGD("Graph merge starts.");
  256. // check input param
  257. for (const auto &it : sub_graph_list) {
  258. if (it == nullptr) {
  259. REPORT_INNER_ERROR("E19999", "sub_graph is nullptr, check invalid.");
  260. GELOGE(GE_GRAPH_PARAM_NULLPTR, "[Check][Param] merging sub-graphs failed, sub-graph is nullptr");
  261. return FAILED;
  262. }
  263. }
  264. bool is_map_empty = graph_info_.end_2_pld_.empty() || graph_info_.pld_2_end_.empty();
  265. if (is_map_empty) {
  266. if (CheckIfEnd2PldEmpty(output_merged_compute_graph) != SUCCESS) {
  267. return FAILED;
  268. }
  269. }
  270. ComputeGraphPtr new_sub_graph = MakeShared<ComputeGraph>(original_compute_graph->GetName());
  271. GE_CHECK_NOTNULL(new_sub_graph);
  272. output_merged_compute_graph = new_sub_graph;
  273. GE_TIMESTAMP_START(MergeSubGraphRemoveNode);
  274. if (RemoveNodeAndEdgeBetweenEndPld(output_merged_compute_graph, sub_graph_list) != ge::SUCCESS) {
  275. GELOGE(GE_GRAPH_PARAM_NULLPTR, "[Call][RemoveNodeAndEdgeBetweenEndPld] failed, graph:%s",
  276. output_merged_compute_graph->GetName().c_str());
  277. return FAILED;
  278. }
  279. GE_TIMESTAMP_END(MergeSubGraphRemoveNode, "GraphPartitioner::MergeGraphRemoveNodeAndEdge");
  280. GE_TIMESTAMP_START(MergeSubGraphTopologicalSorting);
  281. Status ret = output_merged_compute_graph->TopologicalSorting();
  282. if (ret != SUCCESS) {
  283. GELOGE(GE_GRAPH_TOPO_SORT_FAILED, "[Call][TopologicalSorting] for output_merged_compute_graph:%s failed",
  284. output_merged_compute_graph->GetName().c_str());
  285. return FAILED;
  286. }
  287. GE_TIMESTAMP_END(MergeSubGraphTopologicalSorting, "GraphPartitioner::MergeGraphTopologicalSorting");
  288. // flush all nodes' engine of merged graph
  289. GE_TIMESTAMP_START(MergeSubGraphEnginePlacerRun);
  290. graph_info_.engine_placer_.SetComputeGraph(output_merged_compute_graph);
  291. if (graph_info_.engine_placer_.Run() != SUCCESS) {
  292. GELOGE(GE_GRAPH_INIT_FAILED, "[Call][Run] engine_placer run failed, graph:%s",
  293. output_merged_compute_graph->GetName().c_str());
  294. return FAILED;
  295. }
  296. GE_TIMESTAMP_END(MergeSubGraphEnginePlacerRun, "GraphPartitioner::MergeGraphEnginePlacerRun");
  297. GELOGD("Graph merge ends.");
  298. return SUCCESS;
  299. }
  300. Status ge::GraphPartitioner::UpdatePldOpDesc(const NodePtr &dst_node, int input_index, OpDescPtr &pld_op_desc) {
  301. if ((dst_node == nullptr) || (pld_op_desc == nullptr) || (dst_node->GetOpDesc() == nullptr)) {
  302. REPORT_INNER_ERROR("E19999", "Param dst_node or pld_op_desc or op of dst_node is nullptr, check invalid");
  303. GELOGE(FAILED, "[Check][Param] parameter ptr is null.");
  304. return FAILED;
  305. }
  306. const auto &input_desc = dst_node->GetOpDesc()->GetInputDesc(static_cast<uint32_t>(input_index));
  307. GE_IF_BOOL_EXEC(pld_op_desc->AddOutputDesc(input_desc) != GRAPH_SUCCESS,
  308. REPORT_CALL_ERROR("E19999", "AddOutputDesc to op:%s failed", pld_op_desc->GetName().c_str());
  309. GELOGE(FAILED, "[Add][OutputDesc] to op:%s failed", pld_op_desc->GetName().c_str());
  310. return FAILED;)
  311. if (pld_op_desc->MutableOutputDesc(0) != nullptr) {
  312. ge::TensorUtils::SetRealDimCnt(*(pld_op_desc->MutableOutputDesc(0).get()),
  313. static_cast<uint32_t>(input_desc.GetShape().GetDims().size()));
  314. } else {
  315. REPORT_INNER_ERROR("E19999", "output(0) of op:%s is nullptr, check invalid", pld_op_desc->GetName().c_str());
  316. GELOGE(GE_GRAPH_ADD_PLC_END_FAILED, "[Check][Param] output(0) of op:%s is nullptr.",
  317. pld_op_desc->GetName().c_str());
  318. return FAILED;
  319. }
  320. return SUCCESS;
  321. }
  322. Status ge::GraphPartitioner::UpdateEndOpDesc(const NodePtr &src_node, int output_index, OpDescPtr &end_op_desc) {
  323. if ((src_node == nullptr) || (end_op_desc == nullptr) || (src_node->GetOpDesc() == nullptr)) {
  324. REPORT_INNER_ERROR("E19999", "Param src_node or end_op_desc or op of src_node is nullptr, check invalid.");
  325. GELOGE(FAILED, "[Check][Param] parameter ptr is null.");
  326. return FAILED;
  327. }
  328. const auto &output_desc = src_node->GetOpDesc()->GetOutputDesc(static_cast<uint32_t>(output_index));
  329. GE_IF_BOOL_EXEC(end_op_desc->AddInputDesc(output_desc) != GRAPH_SUCCESS,
  330. REPORT_CALL_ERROR("E19999", "AddInputDesc to op:%s failed", end_op_desc->GetName().c_str());
  331. GELOGE(FAILED, "[Add][InputDesc] to op:%s failed", end_op_desc->GetName().c_str());
  332. return FAILED;)
  333. if (end_op_desc->MutableInputDesc(0) != nullptr) {
  334. ge::TensorUtils::SetRealDimCnt(*(end_op_desc->MutableInputDesc(0).get()),
  335. static_cast<uint32_t>(output_desc.GetShape().GetDims().size()));
  336. } else {
  337. REPORT_INNER_ERROR("E19999", "input(0) of op:%s is nullptr, check invalid.", end_op_desc->GetName().c_str());
  338. GELOGE(GE_GRAPH_ADD_PLC_END_FAILED, "[Check][Param] input(0) of op:%s is nullptr.",
  339. end_op_desc->GetName().c_str());
  340. return FAILED;
  341. }
  342. return SUCCESS;
  343. }
  344. graphStatus ge::GraphPartitioner::AddPlaceHolderEndInSrcDstGraph(const AnchorPtr &out_anchor,
  345. const AnchorPtr &peer_in_anchor,
  346. const ge::ComputeGraphPtr &pld_graph,
  347. const ge::ComputeGraphPtr &end_graph) {
  348. GE_CHECK_NOTNULL(peer_in_anchor);
  349. GE_CHECK_NOTNULL(pld_graph);
  350. GE_CHECK_NOTNULL(out_anchor);
  351. GE_CHECK_NOTNULL(end_graph);
  352. const auto &src_node = out_anchor->GetOwnerNode();
  353. const auto &dst_node = peer_in_anchor->GetOwnerNode();
  354. // link input -> end
  355. string end_name = kEndType + std::to_string(graph_info_.num_of_pld_end_);
  356. auto end_op_desc = MakeShared<OpDesc>(end_graph->GetName() + "_" + end_name, END);
  357. GE_CHECK_NOTNULL(end_op_desc);
  358. GE_IF_BOOL_EXEC(!AttrUtils::SetInt(end_op_desc, "peerIndex", graph_info_.num_of_pld_end_),
  359. GELOGW("SetInt peerIndex failed");)
  360. GE_IF_BOOL_EXEC(!AttrUtils::SetStr(end_op_desc, "parentOpType", dst_node->GetType()),
  361. GELOGW("SetStr parentOpType failed");)
  362. GE_IF_BOOL_EXEC(!end_op_desc->SetExtAttr("parentNode", dst_node),
  363. GELOGW("SetEndExtAttr parentNode failed");)
  364. OpDescPtr dst_node_op_desc = dst_node->GetOpDesc();
  365. GE_CHECK_NOTNULL(dst_node_op_desc);
  366. GE_IF_BOOL_EXEC(!AttrUtils::SetStr(end_op_desc, ATTR_NAME_END_REAR_NODE_ENGINE_NAME,
  367. dst_node_op_desc->GetOpEngineName()), GELOGW("SetStr rearNodeEngineName failed");)
  368. // replace input_desc of end with owner node's desc
  369. int output_index = ge::AnchorUtils::GetIdx(out_anchor);
  370. bool is_need_update_desc = (output_index >= 0) && ((graph_info_.mode_ == kAtomicEnginePartitioning) ||
  371. (graph_info_.mode_ == kCompositeEnginePartitioning));
  372. if (is_need_update_desc) {
  373. if (UpdateEndOpDesc(src_node, output_index, end_op_desc) != SUCCESS) {
  374. GELOGE(GRAPH_PARAM_INVALID, "[Update][EndOpDesc] failed, input index:%d, end_op_desc:%s",
  375. output_index, end_op_desc->GetName().c_str());
  376. return FAILED;
  377. }
  378. } else {
  379. GeTensorDesc input_desc;
  380. if (end_op_desc->AddInputDesc(input_desc) != SUCCESS) {
  381. REPORT_CALL_ERROR("E19999", "add input desc to op:%s failed, input index:%d",
  382. end_op_desc->GetName().c_str(), output_index);
  383. GELOGE(GRAPH_PARAM_INVALID, "[Add][InputDesc] to op:%s failed, input index %d",
  384. end_op_desc->GetName().c_str(), output_index);
  385. return FAILED;
  386. }
  387. }
  388. NodePtr new_end_node = end_graph->AddNode(end_op_desc);
  389. if (new_end_node == nullptr) {
  390. REPORT_CALL_ERROR("E19999", "add node:%s in graph:%s failed",
  391. end_op_desc->GetName().c_str(), end_graph->GetName().c_str());
  392. GELOGE(GRAPH_PARAM_INVALID, "[Add][Node] %s in graph:%s failed.",
  393. end_op_desc->GetName().c_str(), end_graph->GetName().c_str());
  394. return FAILED;
  395. }
  396. GE_IF_BOOL_EXEC(new_end_node->SetOwnerComputeGraph(end_graph) != GRAPH_SUCCESS,
  397. REPORT_CALL_ERROR("E19999", "SetOwnerComputeGraph %s for node:%s failed",
  398. end_graph->GetName().c_str(), new_end_node->GetName().c_str());
  399. GELOGE(GRAPH_PARAM_INVALID, "[Set][OwnerComputeGraph] %s for node:%s failed",
  400. end_graph->GetName().c_str(), new_end_node->GetName().c_str());
  401. return FAILED;)
  402. AnchorPtr end_dst_anchor = GetEndInAnchor(out_anchor, new_end_node);
  403. if (GraphUtils::AddEdge(out_anchor, end_dst_anchor) != GRAPH_SUCCESS) {
  404. REPORT_CALL_ERROR("E19999", "add edge from %s to %s failed", out_anchor->GetOwnerNode()->GetName().c_str(),
  405. end_dst_anchor->GetOwnerNode()->GetName().c_str());
  406. GELOGE(GE_GRAPH_ADD_PLC_END_FAILED, "[Add][Edge] from %s to %s failed",
  407. out_anchor->GetOwnerNode()->GetName().c_str(), end_dst_anchor->GetOwnerNode()->GetName().c_str());
  408. return FAILED;
  409. }
  410. /// For fe, op id has been set in AddNode,
  411. /// we can take op id of srcNode as the mark of parentId now
  412. const auto &src_node_opdesc = src_node->GetOpDesc();
  413. GE_CHECK_NOTNULL(src_node_opdesc);
  414. int64_t node_id = src_node_opdesc->GetId();
  415. const string pld_name = kPlaceHolderType + std::to_string(graph_info_.num_of_pld_end_);
  416. auto pld_op_desc = MakeShared<OpDesc>(pld_graph->GetName() + "_" + pld_name, PLACEHOLDER);
  417. GE_CHECK_NOTNULL(pld_op_desc);
  418. GE_IF_BOOL_EXEC(!AttrUtils::SetInt(pld_op_desc, "peerIndex", graph_info_.num_of_pld_end_),
  419. GELOGW("SetInt peerIndex failed");)
  420. GE_IF_BOOL_EXEC(!AttrUtils::SetStr(pld_op_desc, "_peerNodeName", new_end_node->GetName()),
  421. GELOGW("SetStr _peerNodeName failed");)
  422. GE_IF_BOOL_EXEC(!AttrUtils::SetStr(pld_op_desc, "parentOpType", src_node->GetType()),
  423. GELOGW("SetStr parentOpType failed");)
  424. GE_IF_BOOL_EXEC(!AttrUtils::SetStr(pld_op_desc, "_parentNodeName", src_node->GetName()),
  425. GELOGW("SetStr parentOpName failed");)
  426. GE_IF_BOOL_EXEC(!AttrUtils::SetStr(pld_op_desc, "parentId", end_graph->GetName() + ":" + std::to_string(node_id)),
  427. GELOGW("SetStr parentId failed");)
  428. GE_IF_BOOL_EXEC(!AttrUtils::SetInt(pld_op_desc, "anchorIndex", AnchorUtils::GetIdx(out_anchor)),
  429. GELOGW("SetInt anchorIndex failed");)
  430. GE_IF_BOOL_EXEC(!pld_op_desc->SetExtAttr("parentNode", src_node),
  431. GELOGW("SetPldExtAttr parentNode failed");)
  432. GE_IF_BOOL_EXEC(!AttrUtils::SetStr(pld_op_desc, ATTR_NAME_PLD_FRONT_NODE_ENGINE_NAME,
  433. src_node_opdesc->GetOpEngineName()), GELOGW("SetStr frontNodeEngineName failed");)
  434. std::string l2_info_attr;
  435. if (AttrUtils::GetStr(src_node_opdesc, "_task_L2FusionInfo", l2_info_attr)) {
  436. GE_IF_BOOL_EXEC(!AttrUtils::SetStr(pld_op_desc, "_task_L2FusionInfo", l2_info_attr),
  437. GELOGW("SetStr l2_info_attr failed");)
  438. }
  439. int64_t anchor_index_for_lxfusion;
  440. if (AttrUtils::GetInt(src_node_opdesc, "_data_anchor_index_for_lxfusion", anchor_index_for_lxfusion)) {
  441. GE_IF_BOOL_EXEC(!AttrUtils::SetInt(pld_op_desc, "_data_anchor_index_for_lxfusion", anchor_index_for_lxfusion),
  442. GELOGW("SetInt anchor_index_for_lxfusion failed");)
  443. }
  444. // do not care over flow
  445. graph_info_.num_of_pld_end_++;
  446. // replace output_desc of pld with input node's output desc
  447. int input_index = ge::AnchorUtils::GetIdx(peer_in_anchor);
  448. is_need_update_desc = (input_index >= 0) && ((graph_info_.mode_ == kAtomicEnginePartitioning) ||
  449. (graph_info_.mode_ == kCompositeEnginePartitioning));
  450. if (is_need_update_desc) {
  451. if (UpdatePldOpDesc(dst_node, input_index, pld_op_desc) != SUCCESS) {
  452. GELOGE(GRAPH_PARAM_INVALID, "[Update][PldOpDesc] failed, output index:%d, pld_op_desc:%s",
  453. input_index, pld_op_desc->GetName().c_str());
  454. return FAILED;
  455. }
  456. } else {
  457. GeTensorDesc output_desc;
  458. if (pld_op_desc->AddOutputDesc(output_desc) != SUCCESS) {
  459. REPORT_CALL_ERROR("E19999", "AddOutputDesc to op:%s failed, input index %d",
  460. pld_op_desc->GetName().c_str(), input_index);
  461. GELOGE(GRAPH_PARAM_INVALID, "[Add][OutputDesc] to op:%s failed, input index %d",
  462. pld_op_desc->GetName().c_str(), input_index);
  463. return FAILED;
  464. }
  465. }
  466. NodePtr new_pld_node = pld_graph->AddNode(pld_op_desc);
  467. if (new_pld_node == nullptr) {
  468. REPORT_CALL_ERROR("E19999", "AddNode %s in graph:%s failed.",
  469. pld_op_desc->GetName().c_str(), pld_graph->GetName().c_str());
  470. GELOGE(GRAPH_PARAM_INVALID, "[Add][Node] %s in graph:%s failed.",
  471. pld_op_desc->GetName().c_str(), pld_graph->GetName().c_str());
  472. return FAILED;
  473. }
  474. GE_IF_BOOL_EXEC(new_pld_node->SetOwnerComputeGraph(pld_graph) != GRAPH_SUCCESS,
  475. REPORT_CALL_ERROR("E19999", "SetOwnerComputeGraph for node:%s failed, graph:%s",
  476. new_pld_node->GetName().c_str(), pld_graph->GetName().c_str());
  477. GELOGE(GRAPH_PARAM_INVALID, "[Set][OwnerComputeGraph] for node:%s failed, graph:%s",
  478. new_pld_node->GetName().c_str(), pld_graph->GetName().c_str());
  479. return FAILED;)
  480. AnchorPtr pld_src_anchor = GetPldOutAnchor(new_pld_node, peer_in_anchor);
  481. // link placeHolder -> computeNode
  482. if (GraphUtils::AddEdge(pld_src_anchor, peer_in_anchor) != GRAPH_SUCCESS) {
  483. REPORT_CALL_ERROR("E19999", "AddEdge from %s to %s failed",
  484. pld_src_anchor->GetOwnerNode()->GetName().c_str(),
  485. peer_in_anchor->GetOwnerNode()->GetName().c_str());
  486. GELOGE(GE_GRAPH_ADD_PLC_END_FAILED, "[Add][Edge] from %s to %s failed",
  487. pld_src_anchor->GetOwnerNode()->GetName().c_str(), peer_in_anchor->GetOwnerNode()->GetName().c_str());
  488. return FAILED;
  489. }
  490. graph_info_.index_2_end_[graph_info_.num_of_pld_end_] = new_end_node;
  491. graph_info_.pld_2_end_[new_pld_node] = new_end_node;
  492. graph_info_.end_2_pld_[new_end_node] = new_pld_node;
  493. return SUCCESS;
  494. }
  495. Status ge::GraphPartitioner::LinkInput2EndRemoveOrginalLink(ge::NodePtr input_node, ge::ComputeGraphPtr src_graph,
  496. ge::ComputeGraphPtr dst_graph) {
  497. if ((input_node == nullptr) || (src_graph == nullptr) || (dst_graph == nullptr)) {
  498. REPORT_INNER_ERROR("E19999", "Param input_node or src_graph or dst_graph is nullptr, check invalid.");
  499. GELOGE(FAILED, "[Check][Param] parameter input_node or src_graph or dst_graph is nullptr.");
  500. return FAILED;
  501. }
  502. // get the original anchors and remove the original link
  503. for (const auto &out_data_anchor : input_node->GetAllOutAnchors()) {
  504. for (auto &peer_in_anchor : out_data_anchor->GetPeerAnchors()) {
  505. if (peer_in_anchor->GetOwnerNode()->GetType() != kEndType) {
  506. if (GraphUtils::RemoveEdge(out_data_anchor, peer_in_anchor) != GRAPH_SUCCESS) {
  507. REPORT_CALL_ERROR("E19999", "RemoveEdge between %s and %s failed.",
  508. out_data_anchor->GetOwnerNode()->GetName().c_str(),
  509. peer_in_anchor->GetOwnerNode()->GetName().c_str());
  510. GELOGE(FAILED, "[Remove][Edge] between %s and %s failed.",
  511. out_data_anchor->GetOwnerNode()->GetName().c_str(), peer_in_anchor->GetOwnerNode()->GetName().c_str());
  512. return FAILED;
  513. }
  514. // link input -> end
  515. auto ret = AddPlaceHolderEndInSrcDstGraph(out_data_anchor, peer_in_anchor, src_graph, dst_graph);
  516. if (ret != SUCCESS) {
  517. GELOGE(GE_GRAPH_ADD_PLC_END_FAILED, "[Call][AddPlaceHolderEndInSrcDstGraph] failed, ret:%d.", ret);
  518. return ret;
  519. }
  520. } else {
  521. auto end_node = peer_in_anchor->GetOwnerNode();
  522. if (GraphUtils::RemoveJustNode(src_graph, end_node) != GRAPH_SUCCESS) {
  523. REPORT_CALL_ERROR("E19999", "RemoveJustNode %s from graph:%s failed.",
  524. end_node->GetName().c_str(), src_graph->GetName().c_str());
  525. GELOGE(FAILED, "[Remove][JustNode] %s from graph:%s failed.",
  526. end_node->GetName().c_str(), src_graph->GetName().c_str());
  527. return FAILED;
  528. }
  529. if (end_node->SetOwnerComputeGraph(dst_graph) != GRAPH_SUCCESS) {
  530. REPORT_CALL_ERROR("E19999", "SetOwnerComputeGraph for node:%s failed, graph:%s.",
  531. end_node->GetName().c_str(), dst_graph->GetName().c_str());
  532. GELOGE(FAILED, "[Set][OwnerComputeGraph] to node:%s failed, graph:%s.",
  533. end_node->GetName().c_str(), dst_graph->GetName().c_str());
  534. return FAILED;
  535. }
  536. if (dst_graph->AddNode(end_node) == nullptr) {
  537. REPORT_CALL_ERROR("E19999", "AddNode %s in graph:%s failed.",
  538. end_node->GetName().c_str(), dst_graph->GetName().c_str());
  539. GELOGE(FAILED, "[Add][Node] %s in graph:%s failed.",
  540. end_node->GetName().c_str(), dst_graph->GetName().c_str());
  541. return FAILED;
  542. }
  543. }
  544. }
  545. }
  546. return SUCCESS;
  547. }
  548. Status ge::GraphPartitioner::PutInputNodesInSubGraph(const ge::ComputeGraphPtr &src_graph,
  549. const ge::ComputeGraphPtr &dst_graph) {
  550. if ((src_graph == nullptr) || (dst_graph == nullptr)) {
  551. REPORT_INNER_ERROR("E19999", "Param src_graph or dst_graph is nullptr, check invalid.");
  552. GELOGE(FAILED, "[Check][Param] parameter src_graph or dst_graph is nullptr.");
  553. return FAILED;
  554. }
  555. for (auto &input_node : src_graph->GetDirectNode()) {
  556. if (IsDataLike(input_node)) {
  557. if (input_node->SetOwnerComputeGraph(dst_graph) != GRAPH_SUCCESS) {
  558. REPORT_CALL_ERROR("E19999", "SetOwnerComputeGraph for node:%s failed, graph:%s.",
  559. input_node->GetName().c_str(), dst_graph->GetName().c_str());
  560. GELOGE(FAILED, "[Set][OwnerComputeGraph] for node:%s failed, graph:%s.",
  561. input_node->GetName().c_str(), dst_graph->GetName().c_str());
  562. return FAILED;
  563. }
  564. // remove input node from src_graph
  565. if (GraphUtils::RemoveJustNode(src_graph, input_node) != GRAPH_SUCCESS) {
  566. REPORT_CALL_ERROR("E19999", "RemoveJustNode %s from graph:%s failed.",
  567. input_node->GetName().c_str(), src_graph->GetName().c_str());
  568. GELOGE(FAILED, "[Remove][JustNode] %s from graph:%s failed.",
  569. input_node->GetName().c_str(), src_graph->GetName().c_str());
  570. return FAILED;
  571. }
  572. // add input node to dst_graph
  573. if (dst_graph->AddNode(input_node) == nullptr) {
  574. REPORT_CALL_ERROR("E19999", "AddNode %s in graph:%s failed.",
  575. input_node->GetName().c_str(), src_graph->GetName().c_str());
  576. GELOGE(FAILED, "[Add][Node] %s in graph:%s failed.",
  577. input_node->GetName().c_str(), src_graph->GetName().c_str());
  578. return FAILED;
  579. }
  580. if (LinkInput2EndRemoveOrginalLink(input_node, src_graph, dst_graph) != ge::SUCCESS) {
  581. GELOGE(FAILED, "[Call][LinkInput2EndRemoveOrginalLink] failed.");
  582. return FAILED;
  583. }
  584. }
  585. }
  586. return SUCCESS;
  587. }
  588. void ge::GraphPartitioner::AddNewGraphToPartition(ge::ComputeGraphPtr &input_graph, const std::string &engine_name) {
  589. if (input_graph == nullptr) {
  590. GELOGW("[GraphPartitioner]: input_graph is null, engine name is %s", engine_name.c_str());
  591. return;
  592. }
  593. graph_info_.partitions_[input_graph] = engine_name;
  594. }
  595. bool ge::GraphPartitioner::IsDataLike(ge::NodePtr node) {
  596. return (node->GetType() == CONSTANT) || (node->GetType() == DATA) || (node->GetType() == AIPPDATA) ||
  597. (node->GetType() == CONSTANTOP) || (node->GetType() == VARIABLE);
  598. }
  599. bool ge::GraphPartitioner::HasNoInput(ge::NodePtr node) {
  600. if (node == nullptr) {
  601. GELOGE(FAILED, "[Check][Param] node is nullptr.");
  602. return true;
  603. }
  604. return node->GetInNodes().empty();
  605. }
  606. Status ge::GraphPartitioner::Initialize(ge::ComputeGraphPtr compute_graph) {
  607. GELOGI("Initialize starts.");
  608. GE_CHECK_NOTNULL(compute_graph);
  609. const auto &node_engine_map = GetNodeEngineMap();
  610. size_t temp_index = 0;
  611. // travese nodes by topo order one by one
  612. for (const auto &node : compute_graph->GetDirectNode()) {
  613. std::string temp_stream;
  614. // node opdesc has been checked before
  615. (void)AttrUtils::GetStr(node->GetOpDesc(), ATTR_NAME_STREAM_LABEL, temp_stream);
  616. ClusterPtr new_cluster;
  617. // data like node without input should be handle specific
  618. if (HasNoInput(node) && IsDataLike(node)) {
  619. ClusterPtr cluster = MakeShared<Cluster>(temp_index, kEngineDefaultData, temp_stream);
  620. new_cluster = cluster;
  621. } else {
  622. if (node_engine_map.count(node) == 0) {
  623. REPORT_INNER_ERROR("E19999", "node:%s not find in node_engine_map", node->GetName().c_str());
  624. GELOGE(FAILED, "[Check][Param] node[%s] does not owner engine!", node->GetName().c_str());
  625. return FAILED;
  626. }
  627. ClusterPtr cluster = MakeShared<Cluster>(temp_index, node_engine_map.at(node), temp_stream);
  628. new_cluster = cluster;
  629. }
  630. if (new_cluster == nullptr) {
  631. REPORT_CALL_ERROR("E19999", "Allocate Cluster failed, index:%zu", temp_index);
  632. GELOGE(FAILED, "[Allocate][Cluster] failed, index:%zu", temp_index);
  633. return FAILED;
  634. }
  635. new_cluster->nodes_.push_back(node);
  636. if (!HasNoInput(node)) {
  637. auto node_id = node->GetOpDesc()->GetId();
  638. for (const auto &parent : node->GetInAllNodes()) {
  639. auto parent_id = parent->GetOpDesc()->GetId();
  640. if (parent_id < node_id) {
  641. auto iter = graph_info_.node_2_cluster_.find(parent);
  642. if (iter == graph_info_.node_2_cluster_.end()) {
  643. REPORT_INNER_ERROR("E19999", "node[%s]id[%ld]'s parent_node[%s]id[%ld] should make cluster in advance",
  644. node->GetOpDesc()->GetName().c_str(), node_id,
  645. parent->GetOpDesc()->GetName().c_str(), parent_id);
  646. GELOGE(FAILED, "[Check][Param] node[%s]id[%ld]'s parent_node[%s]id[%ld] should make cluster in advance",
  647. node->GetOpDesc()->GetName().c_str(), node_id, parent->GetOpDesc()->GetName().c_str(), parent_id);
  648. return FAILED;
  649. }
  650. new_cluster->in_clu_.insert(iter->second->index_);
  651. iter->second->out_clu_.insert(temp_index);
  652. }
  653. }
  654. }
  655. graph_info_.node_2_cluster_[node] = new_cluster;
  656. graph_info_.clusters_[temp_index] = new_cluster;
  657. GELOGD("Node name is %s, engine is %s, cluster index is %zu, stream label is %s", node->GetName().c_str(),
  658. new_cluster->engine_name_.c_str(), new_cluster->index_, new_cluster->stream_label_.c_str());
  659. temp_index++;
  660. }
  661. GELOGD("Initialize ends.");
  662. return SUCCESS;
  663. }
  664. Status ge::GraphPartitioner::AddPartitionsToGraphNode(vector<ge::SubGraphInfoPtr> &output_subgraphs,
  665. ge::ComputeGraphPtr compute_graph) {
  666. const std::string &input_subgraph_name = "inputNodesSubGraph";
  667. string session_graph_id;
  668. if (!AttrUtils::GetStr(*compute_graph, ATTR_NAME_SESSION_GRAPH_ID, session_graph_id)) {
  669. GELOGW("Get graph session_graph_id attr failed.");
  670. return INTERNAL_ERROR;
  671. }
  672. // the output_subgraphs have topological order
  673. for (const auto &sub_graph : graph_info_.rank_2_partitions_) {
  674. if (graph_info_.partitions_.find(sub_graph) == graph_info_.partitions_.end()) {
  675. REPORT_INNER_ERROR("E19999", "partition is null, subgraph:%s", sub_graph->GetName().c_str());
  676. GELOGE(GE_GRAPH_EMPTY_PARTITION, "[Check][Param] partition is null, subgraph:%s", sub_graph->GetName().c_str());
  677. return FAILED;
  678. }
  679. auto &engine_name = graph_info_.partitions_.at(sub_graph);
  680. (void)AttrUtils::SetStr(sub_graph, ATTR_NAME_PARENT_GRAPH_NAME, compute_graph->GetName());
  681. (void)sub_graph->SetExtAttr("part_src_graph", compute_graph);
  682. GELOGD("set attr success. subgraph(%s) with parent graph(%s)", sub_graph->GetName().c_str(),
  683. compute_graph->GetName().c_str());
  684. GE_DUMP(sub_graph, sub_graph->GetName() + "_" + mode_2_str_[graph_info_.mode_]);
  685. if (!session_graph_id.empty()) {
  686. GE_IF_BOOL_EXEC(!AttrUtils::SetStr(sub_graph, ATTR_NAME_SESSION_GRAPH_ID, session_graph_id),
  687. GELOGW("SetStr ATTR_NAME_SESSION_GRAPH_ID failed");)
  688. }
  689. // flush parent node of subgraph
  690. sub_graph->SetParentNode(compute_graph->GetParentNode());
  691. auto sgi = MakeShared<SubGraphInfo>();
  692. if (sgi == nullptr) {
  693. REPORT_CALL_ERROR("E19999", "allocate memory for SubGraphInfo failed.");
  694. GELOGE(GE_GRAPH_PARAM_NULLPTR, "[Allocate][Memory] for SubGraphInfo failed.");
  695. return FAILED;
  696. }
  697. // set engine name
  698. sgi->SetEngineName(engine_name);
  699. // set stream label
  700. string sub_graph_stream;
  701. if (AttrUtils::GetStr(sub_graph->GetDirectNode().at(0)->GetOpDesc(), ATTR_NAME_STREAM_LABEL, sub_graph_stream)) {
  702. sgi->SetStreamLabel(sub_graph_stream);
  703. }
  704. /// for now inputFlag is the same before and after partition. It should
  705. /// be changed according to the real partition
  706. std::vector<bool> sub_graph_input(graph_info_.input_size_, true);
  707. std::vector<bool> sub_graph_output(graph_info_.output_size_, true);
  708. sgi->SetSubGraph(sub_graph);
  709. sgi->SetOutputFlag(sub_graph_output);
  710. sgi->SetInputFlag(sub_graph_input);
  711. sgi->SetOutputContext(graph_info_.output_name_);
  712. AddEndPldInformationToSubGraphInfo(sgi);
  713. GELOGI("[GraphPartitioner]: subGraph engine name is %s, graph name is %s, stream label is %s", engine_name.c_str(),
  714. sub_graph->GetName().c_str(), sgi->GetStreamLabel().empty() ? "null" : sgi->GetStreamLabel().c_str());
  715. if (engine_name != input_subgraph_name) { // do not add Data subGraph into SubGraphInfo
  716. output_subgraphs.push_back(sgi);
  717. } else {
  718. graph_2_input_subgraph_[compute_graph] = sgi;
  719. }
  720. }
  721. return SUCCESS;
  722. }
  723. // check if two clusters can merge
  724. bool ge::GraphPartitioner::IsMergeable(size_t parent_cluster, size_t child_cluster, size_t upper_bound) {
  725. if ((graph_info_.clusters_[parent_cluster] == nullptr) || (graph_info_.clusters_[parent_cluster]->nodes_.empty()) ||
  726. (graph_info_.clusters_[child_cluster] == nullptr) || (graph_info_.clusters_[child_cluster]->nodes_.empty())) {
  727. return false;
  728. }
  729. // Check if parent_cluster,child_cluster has same engine or stream label
  730. if ((graph_info_.clusters_[parent_cluster]->engine_name_ != graph_info_.clusters_[child_cluster]->engine_name_) ||
  731. (graph_info_.clusters_[parent_cluster]->stream_label_ != graph_info_.clusters_[child_cluster]->stream_label_)) {
  732. GELOGD("Parent cluster %zu engine %s stream label %s, child cluster %zu engine %s stream label %s can not merge",
  733. parent_cluster, graph_info_.clusters_[parent_cluster]->engine_name_.c_str(),
  734. graph_info_.clusters_[parent_cluster]->stream_label_.c_str(), child_cluster,
  735. graph_info_.clusters_[child_cluster]->engine_name_.c_str(),
  736. graph_info_.clusters_[child_cluster]->stream_label_.c_str());
  737. return false;
  738. }
  739. // Check if parent_cluster,child_cluster is reachable
  740. RemoveEdge(parent_cluster, child_cluster);
  741. // Check if there is a path between parent and child, if return true, can not merge
  742. if (HasSecondPath(parent_cluster, child_cluster, upper_bound)) {
  743. GELOGD("Find second path from %zu to %zu, upper bound is %zu", parent_cluster, child_cluster, upper_bound);
  744. InsertEdge(parent_cluster, child_cluster);
  745. return false;
  746. }
  747. InsertEdge(parent_cluster, child_cluster);
  748. return true;
  749. }
  750. void ge::GraphPartitioner::MergeTwoClusters(size_t parent_cluster, size_t &child_cluster) {
  751. // check which index is bigger
  752. size_t big_cluster, small_cluster;
  753. size_t child_cluster_original = child_cluster;
  754. if (parent_cluster > child_cluster) {
  755. small_cluster = child_cluster;
  756. big_cluster = parent_cluster;
  757. } else {
  758. big_cluster = child_cluster;
  759. small_cluster = parent_cluster;
  760. // flush child_cluster, because it has been modified
  761. child_cluster = small_cluster;
  762. }
  763. // update node_2_cluster_ map
  764. for (auto &node : graph_info_.clusters_[big_cluster]->nodes_) {
  765. graph_info_.node_2_cluster_[node] = graph_info_.clusters_[small_cluster];
  766. }
  767. // merge nodes
  768. graph_info_.clusters_[small_cluster]->nodes_.splice(graph_info_.clusters_[small_cluster]->nodes_.end(),
  769. graph_info_.clusters_[big_cluster]->nodes_);
  770. // merge all input & output to small cluster
  771. graph_info_.clusters_[small_cluster]->in_clu_.insert(graph_info_.clusters_[big_cluster]->in_clu_.begin(),
  772. graph_info_.clusters_[big_cluster]->in_clu_.end());
  773. graph_info_.clusters_[small_cluster]->out_clu_.insert(graph_info_.clusters_[big_cluster]->out_clu_.begin(),
  774. graph_info_.clusters_[big_cluster]->out_clu_.end());
  775. // remove child_cluster's out parent_cluster's in between child_cluster and parent_cluster
  776. RemoveEdge(parent_cluster, child_cluster_original);
  777. // update in/out of the cluster with bigger index
  778. for (auto in_clu : graph_info_.clusters_[big_cluster]->in_clu_) {
  779. graph_info_.clusters_[in_clu]->out_clu_.insert(small_cluster);
  780. graph_info_.clusters_[in_clu]->out_clu_.erase(big_cluster);
  781. }
  782. for (auto out_clu : graph_info_.clusters_[big_cluster]->out_clu_) {
  783. graph_info_.clusters_[out_clu]->in_clu_.insert(small_cluster);
  784. graph_info_.clusters_[out_clu]->in_clu_.erase(big_cluster);
  785. }
  786. graph_info_.clusters_[big_cluster] = graph_info_.clusters_[small_cluster];
  787. }
  788. void ge::GraphPartitioner::RemoveEdge(size_t parent_cluster, size_t child_cluster) {
  789. graph_info_.clusters_[child_cluster]->in_clu_.erase(parent_cluster);
  790. graph_info_.clusters_[parent_cluster]->out_clu_.erase(child_cluster);
  791. }
  792. void ge::GraphPartitioner::InsertEdge(size_t from, size_t to) {
  793. if (from == to) {
  794. return;
  795. }
  796. if (!graph_info_.clusters_[from]->out_clu_.insert(to).second) {
  797. // edge has already exists
  798. return;
  799. }
  800. graph_info_.clusters_[to]->in_clu_.insert(from);
  801. }
  802. void ge::GraphPartitioner::MarkClusters() {
  803. GELOGI("MarkClusters starts. cluster size is %zu", graph_info_.clusters_.size());
  804. size_t cluster_size = graph_info_.clusters_.size();
  805. for (size_t child_cluster = 0; child_cluster < cluster_size; child_cluster++) {
  806. auto found_child_cluster = graph_info_.clusters_[child_cluster];
  807. if (found_child_cluster == nullptr) {
  808. GELOGW("can not found child_cluster is %zu", child_cluster);
  809. continue;
  810. }
  811. auto copy_parents_clusters = found_child_cluster->in_clu_;
  812. vector<size_t> ordered_cluster;
  813. for (const auto &parent_cluster : copy_parents_clusters) {
  814. ordered_cluster.emplace_back(parent_cluster);
  815. }
  816. // sort cluster according to it's output amount
  817. auto comp_func = [this](const size_t &parent_cluster1, const size_t &parent_cluster2) -> bool {
  818. return graph_info_.clusters_[parent_cluster1]->out_clu_.size() <
  819. graph_info_.clusters_[parent_cluster2]->out_clu_.size();
  820. };
  821. std::sort(ordered_cluster.begin(), ordered_cluster.end(), comp_func);
  822. auto child_merged = child_cluster;
  823. for (const auto &parent_cluster : ordered_cluster) {
  824. if (IsMergeable(parent_cluster, child_merged, child_cluster)) {
  825. MergeTwoClusters(parent_cluster, child_merged);
  826. GELOGD("Merging cluster %zu and %zu to %zu", parent_cluster, child_cluster, child_merged);
  827. }
  828. }
  829. }
  830. GELOGD("MarkClusters ends.");
  831. }
  832. Status ge::GraphPartitioner::SplitSubGraphs(ge::ComputeGraphPtr compute_graph) {
  833. GELOGD("SplitSubGraphs starts.");
  834. if (compute_graph == nullptr) {
  835. REPORT_INNER_ERROR("E19999", "Param compute_graph is nullptr, check invalid");
  836. GELOGE(FAILED, "[Check][Param] parameter ptr is null.");
  837. return FAILED;
  838. }
  839. // Create graphs for all clusters
  840. std::unordered_set<ClusterPtr> cluster_set;
  841. // add pld&end
  842. for (auto &node : compute_graph->GetDirectNode()) {
  843. GELOGD("Node name is %s.", node->GetName().c_str());
  844. auto child_cluster = graph_info_.node_2_cluster_[node];
  845. ge::ComputeGraphPtr corresponding_graph;
  846. // unordered_set's insert returns a pair, second of pair is bool
  847. if (!cluster_set.insert(child_cluster).second) {
  848. GELOGD("Old sub graph, child_cluster is %zu", child_cluster->index_);
  849. corresponding_graph = graph_info_.cluster_2_partition_.at(child_cluster);
  850. } else {
  851. std::string graph_name = "new_sub_graph" + std::to_string(graph_info_.partitions_.size());
  852. ComputeGraphPtr new_sub_graph = MakeShared<ge::ComputeGraph>(graph_name);
  853. if (new_sub_graph == nullptr) {
  854. REPORT_CALL_ERROR("E19999", "allocate memory for ge::ComputeGraph failed.");
  855. GELOGE(GE_GRAPH_PARAM_NULLPTR, "[Allocate][Memory] for ge::ComputeGraph failed.");
  856. return FAILED;
  857. }
  858. AddNewGraphToPartition(new_sub_graph, child_cluster->engine_name_);
  859. corresponding_graph = new_sub_graph;
  860. graph_info_.cluster_2_partition_[child_cluster] = corresponding_graph;
  861. GELOGD("New sub graph, name is %s", graph_name.c_str());
  862. }
  863. // build node to corresponding node map
  864. NodePtr corresponding_node = corresponding_graph->AddNode(node->GetOpDesc());
  865. if (corresponding_node == nullptr) {
  866. REPORT_CALL_ERROR("E19999", "add node:%s in graph:%s failed",
  867. node->GetName().c_str(), corresponding_graph->GetName().c_str());
  868. GELOGE(GE_GRAPH_PARAM_NULLPTR, "[Add][Node] %s in graph:%s failed.",
  869. node->GetName().c_str(), corresponding_graph->GetName().c_str());
  870. return FAILED;
  871. }
  872. graph_info_.corresponding_node_in_partitions_[node] = corresponding_node;
  873. GE_CHK_STATUS_RET(corresponding_node->SetOwnerComputeGraph(corresponding_graph))
  874. for (const auto &in_anchor : node->GetAllInAnchors()) {
  875. GELOGD("In anchor index is %d", AnchorUtils::GetIdx(in_anchor));
  876. for (auto &peer_out_anchor : in_anchor->GetPeerAnchors()) {
  877. GELOGD("Peer out anchor index is %d", AnchorUtils::GetIdx(peer_out_anchor));
  878. // Normally, all nodes have a copy in corresponding_node_in_partitions_, so function at can not be exception
  879. auto iter = graph_info_.corresponding_node_in_partitions_.find(peer_out_anchor->GetOwnerNode());
  880. if (iter == graph_info_.corresponding_node_in_partitions_.end()) {
  881. REPORT_INNER_ERROR("E19999", "node[%s]id[%ld]'s parent_node[%s]id[%ld]"
  882. "should make corresponding in advance",
  883. node->GetOpDesc()->GetName().c_str(), node->GetOpDesc()->GetId(),
  884. peer_out_anchor->GetOwnerNode()->GetOpDesc()->GetName().c_str(),
  885. peer_out_anchor->GetOwnerNode()->GetOpDesc()->GetId());
  886. GELOGE(GRAPH_FAILED, "[Check][Param] node[%s]id[%ld]'s parent_node[%s]id[%ld]"
  887. "should make corresponding in advance",
  888. node->GetOpDesc()->GetName().c_str(), node->GetOpDesc()->GetId(),
  889. peer_out_anchor->GetOwnerNode()->GetOpDesc()->GetName().c_str(),
  890. peer_out_anchor->GetOwnerNode()->GetOpDesc()->GetId());
  891. return GRAPH_FAILED;
  892. }
  893. auto parent_node = iter->second;
  894. GE_CHECK_NOTNULL(parent_node);
  895. GELOGD("Parent node name is %s", parent_node->GetName().c_str());
  896. // add edge
  897. auto src_anchor = parent_node->GetOutAnchor(AnchorUtils::GetIdx(peer_out_anchor));
  898. auto dst_anchor = corresponding_node->GetInAnchor(AnchorUtils::GetIdx(in_anchor));
  899. // if child and parent's cluster is not same, add plc and end
  900. auto parent_cluster = graph_info_.node_2_cluster_[peer_out_anchor->GetOwnerNode()];
  901. if (parent_cluster != child_cluster) {
  902. GELOGD("Parent cluster is %zu, child_cluster is %zu", parent_cluster->index_, child_cluster->index_);
  903. if (AddPlaceHolderEnd(peer_out_anchor, in_anchor) != ge::SUCCESS) {
  904. GELOGE(GE_GRAPH_ADD_PLC_END_FAILED,
  905. "[Call][AddPlaceHolderEnd] failed, out_anchor:%s index:%d, in_anchor:%s index:%d.",
  906. peer_out_anchor->GetOwnerNode()->GetName().c_str(), AnchorUtils::GetIdx(peer_out_anchor),
  907. in_anchor->GetOwnerNode()->GetName().c_str(), AnchorUtils::GetIdx(in_anchor));
  908. return FAILED;
  909. }
  910. } else { // parent and child in the same cluster, add edge
  911. GELOGD("AddEdge from parent cluster %zu to child %zu", parent_cluster->index_, child_cluster->index_);
  912. if (GraphUtils::AddEdge(src_anchor, dst_anchor) != GRAPH_SUCCESS) {
  913. REPORT_CALL_ERROR("E19999", "add edge from %s to %s failed",
  914. peer_out_anchor->GetOwnerNode()->GetName().c_str(),
  915. in_anchor->GetOwnerNode()->GetName().c_str());
  916. GELOGE(GRAPH_FAILED, "[Add][Edge] from %s to %s failed", peer_out_anchor->GetOwnerNode()->GetName().c_str(),
  917. in_anchor->GetOwnerNode()->GetName().c_str());
  918. return FAILED;
  919. }
  920. }
  921. }
  922. }
  923. }
  924. GELOGD("SplitSubGraphs ends.");
  925. return SUCCESS;
  926. }
  927. /// before calling this function, the direct path between src and dst are already removed.
  928. /// return true if a second path is found
  929. bool ge::GraphPartitioner::HasSecondPath(size_t src, size_t dst, size_t upper_bound) {
  930. if (graph_info_.clusters_.at(src)->out_clu_.empty() || graph_info_.clusters_.at(dst)->in_clu_.empty()) {
  931. return false;
  932. }
  933. /// Avoid recursion since stack space might be limited.
  934. /// We instead keep a stack of nodes to visit.
  935. std::vector<size_t> temp_stack;
  936. std::set<size_t> visited;
  937. temp_stack.push_back(src);
  938. while (!temp_stack.empty()) {
  939. size_t cluster = temp_stack.back();
  940. temp_stack.pop_back();
  941. ClusterPtr cur_cluster = graph_info_.clusters_[cluster];
  942. if (!visited.insert(cluster).second) {
  943. continue;
  944. }
  945. for (auto out : cur_cluster->out_clu_) {
  946. if (out == dst) {
  947. return true; // There is cycle
  948. }
  949. if (out < upper_bound) {
  950. temp_stack.push_back(out);
  951. }
  952. }
  953. }
  954. return false;
  955. }
  956. Status ge::GraphPartitioner::Partition(ge::ComputeGraphPtr compute_graph, Mode mode) {
  957. if (compute_graph->TopologicalSorting() != SUCCESS) {
  958. REPORT_CALL_ERROR("E19999", "TopologicalSorting for graph:%s failed",
  959. compute_graph->GetName().c_str());
  960. GELOGE(GE_GRAPH_TOPO_SORT_FAILED, "[Call][TopologicalSorting] for subGraph:%s failed",
  961. compute_graph->GetName().c_str());
  962. return FAILED;
  963. }
  964. graph_info_.engine_placer_.SetComputeGraph(compute_graph);
  965. if (graph_info_.engine_placer_.Run(false) != SUCCESS) {
  966. GELOGE(FAILED, "[Call][Run] Engine placer run failed, graph:%s.", compute_graph->GetName().c_str());
  967. return FAILED;
  968. }
  969. if (mode == GraphPartitioner::kCompositeEnginePartitioning) {
  970. if (graph_info_.engine_placer_.AssignCompositeEngine() != SUCCESS) {
  971. GELOGE(FAILED, "[Partition][SubGraph] Assign composite engine for graph %s failed",
  972. compute_graph->GetName().c_str());
  973. return FAILED;
  974. }
  975. }
  976. ClearAllPartitionData();
  977. auto real_ret = SUCCESS;
  978. auto ret = PartitionSubGraph(compute_graph, mode);
  979. if (ret != SUCCESS) {
  980. GELOGE(ret, "[Partition][SubGraph] Failed, ret:%d", ret);
  981. real_ret = ret;
  982. }
  983. GE_CHECK_NOTNULL(compute_graph);
  984. // partition sub graph
  985. for (const auto &sub_graph : compute_graph->GetAllSubgraphs()) {
  986. ret = PartitionSubGraph(sub_graph, mode);
  987. if (ret != SUCCESS) {
  988. GELOGE(ret, "[Partition][SubGraph] Failed, ret:%d", ret);
  989. real_ret = ret;
  990. }
  991. }
  992. if (real_ret != SUCCESS) {
  993. auto root_graph = ge::GraphUtils::FindRootGraph(compute_graph);
  994. GE_CHECK_NOTNULL(root_graph);
  995. (void)Analyzer::GetInstance()->SaveAnalyzerDataToFile(root_graph->GetSessionID(),
  996. root_graph->GetGraphID());
  997. }
  998. return real_ret;
  999. }
  1000. Status ge::GraphPartitioner::PartitionSubGraph(ge::ComputeGraphPtr compute_graph, Mode mode) {
  1001. if (compute_graph == nullptr) {
  1002. REPORT_INNER_ERROR("E19999", "Param compute_graph is nullptr, check invalid.");
  1003. GELOGE(GE_GRAPH_NULL_INPUT, "[Check][Param] compute_graph is nullptr.");
  1004. return FAILED;
  1005. }
  1006. // clear graph_info
  1007. graph_info_.ClearAllData(mode);
  1008. graph_info_.output_name_ = compute_graph->GetOutput();
  1009. graph_info_.output_size_ = compute_graph->GetOutputSize();
  1010. graph_info_.input_size_ = compute_graph->GetInputSize();
  1011. if (graph_info_.output_size_ == 0) {
  1012. REPORT_INNER_ERROR("E19999", "the output size of graph:%s is 0, check invalid.",
  1013. compute_graph->GetName().c_str());
  1014. GELOGE(GE_GRAPH_NULL_INPUT, "[Check][Param] The output size:0 of graph:%s need to be greater than 0.",
  1015. compute_graph->GetName().c_str());
  1016. return FAILED;
  1017. }
  1018. GELOGI("Graph Partition starts, graph nodes size is %zu", compute_graph->GetDirectNodesSize());
  1019. GE_TIMESTAMP_START(PartitionSubGraphInitialize);
  1020. if (Initialize(compute_graph) != SUCCESS) {
  1021. GELOGE(GE_GRAPH_INIT_FAILED, "[Call][Initialize] for graph:%s failed", compute_graph->GetName().c_str());
  1022. return FAILED;
  1023. }
  1024. GE_TIMESTAMP_END(PartitionSubGraphInitialize, "GraphPartitioner::PartitionInitialize");
  1025. GE_TIMESTAMP_START(PartitionSubGraphMarkClusters);
  1026. MarkClusters();
  1027. GE_TIMESTAMP_END(PartitionSubGraphMarkClusters, "GraphPartitioner::PartitionMarkClusters");
  1028. GE_TIMESTAMP_START(PartitionSubGraphSplitSubGraphs);
  1029. if (SplitSubGraphs(compute_graph) != SUCCESS) {
  1030. GELOGE(FAILED, "[Split][SubGraphs] for graph:%s failed", compute_graph->GetName().c_str());
  1031. return FAILED;
  1032. }
  1033. GE_TIMESTAMP_END(PartitionSubGraphSplitSubGraphs, "GraphPartitioner::PartitionSplitSubGraphs");
  1034. GE_TIMESTAMP_START(PartitionSubGraphSortSubGraphs);
  1035. if (SortSubGraphs(compute_graph) != ge::SUCCESS) {
  1036. GELOGE(GE_GRAPH_TOPO_SORT_FAILED, "[Sort][SubGraphs] for graph:%s failed.",
  1037. compute_graph->GetName().c_str());
  1038. return ge::FAILED;
  1039. }
  1040. GE_TIMESTAMP_END(PartitionSubGraphSortSubGraphs, "GraphPartitioner::PartitionSortSubGraphs");
  1041. GE_TIMESTAMP_START(PartitionSubGraphAddPartitionsToGraphNode);
  1042. vector<ge::SubGraphInfoPtr> output_subgraphs;
  1043. if (AddPartitionsToGraphNode(output_subgraphs, compute_graph) != ge::SUCCESS) {
  1044. GELOGE(GE_GRAPH_EMPTY_PARTITION, "[Add][Partitions] To GraphNode failed, graph:%s.",
  1045. compute_graph->GetName().c_str());
  1046. return ge::FAILED;
  1047. }
  1048. GE_TIMESTAMP_END(PartitionSubGraphAddPartitionsToGraphNode, "GraphPartitioner::PartitionAddPartitionsToGraphNode");
  1049. GELOGI("Graph Partition ends. Adding partitions to SubGraphInfo, got %zu sub graphs", output_subgraphs.size());
  1050. graph_info_.mode_ = kMerging;
  1051. // do not care over flow
  1052. partition_times_++;
  1053. graph_2_graph_partition_info_[compute_graph] = graph_info_;
  1054. graph_2_subgraph_list_[compute_graph] = output_subgraphs;
  1055. return SUCCESS;
  1056. }
  1057. // all the inputs are the nodes and anchors in the original graph
  1058. Status ge::GraphPartitioner::AddPlaceHolderEnd(const AnchorPtr &out_anchor, const AnchorPtr &in_anchor) {
  1059. if ((out_anchor == nullptr) || (in_anchor == nullptr)) {
  1060. REPORT_INNER_ERROR("E19999", "Param out_anchor or in_anchor is nullptr, check invalid.");
  1061. GELOGE(GE_GRAPH_PARAM_NULLPTR, "[Check][Param] out_anchor or in_anchor is nullptr.");
  1062. return FAILED;
  1063. }
  1064. // nodes in original graph
  1065. const auto &src_node = out_anchor->GetOwnerNode();
  1066. const auto &dst_node = in_anchor->GetOwnerNode();
  1067. if ((src_node == nullptr) || (dst_node == nullptr)) {
  1068. REPORT_INNER_ERROR("E19999", "in_anchor'node or out_anchor'node is nullptr. check invalid.");
  1069. GELOGE(GE_GRAPH_PARAM_NULLPTR, "[Check][Param] src_node or dst_node is nullptr.");
  1070. return FAILED;
  1071. }
  1072. // All nodes have a copy in corresponding_node_in_partitions_, so function at can not be execption
  1073. auto src_anchor =
  1074. graph_info_.corresponding_node_in_partitions_.at(src_node)->GetOutAnchor(AnchorUtils::GetIdx(out_anchor));
  1075. auto dst_anchor =
  1076. graph_info_.corresponding_node_in_partitions_.at(dst_node)->GetInAnchor(AnchorUtils::GetIdx(in_anchor));
  1077. if ((src_anchor == nullptr) || (dst_anchor == nullptr)) {
  1078. REPORT_INNER_ERROR("E19999", "src_anchor(index:%d) or dst_anchor(index:%d) is nullptr.",
  1079. AnchorUtils::GetIdx(out_anchor), AnchorUtils::GetIdx(in_anchor));
  1080. GELOGE(GE_GRAPH_PARAM_NULLPTR, "[Check][Param] src_anchor(index:%d) or dst_anchor(index:%d) is nullptr.",
  1081. AnchorUtils::GetIdx(out_anchor), AnchorUtils::GetIdx(in_anchor));
  1082. return FAILED;
  1083. }
  1084. // anchors in subGraph
  1085. const ComputeGraphPtr &src_subgraph = src_anchor->GetOwnerNode()->GetOwnerComputeGraph();
  1086. const ComputeGraphPtr &dst_subgraph = dst_anchor->GetOwnerNode()->GetOwnerComputeGraph();
  1087. // add end and pld node
  1088. auto ret = AddPlaceHolderEndInSrcDstGraph(src_anchor, dst_anchor, dst_subgraph, src_subgraph);
  1089. if (ret != SUCCESS) {
  1090. GELOGE(GE_GRAPH_ADD_PLC_END_FAILED, "[Call][AddPlaceHolderEndInSrcDstGraph] failed, ret:%d.", ret);
  1091. return ret;
  1092. }
  1093. return SUCCESS;
  1094. }
  1095. Status ge::GraphPartitioner::SortSubGraphs(const ge::ComputeGraphPtr &compute_graph) {
  1096. uint32_t rank = kRankOne; // rank 0 for data graph
  1097. ComputeGraphPtr new_input_nodes_sub_graph = MakeShared<ComputeGraph>("inputNodeGraph");
  1098. if ((new_input_nodes_sub_graph == nullptr) || (compute_graph == nullptr)) {
  1099. REPORT_INNER_ERROR("E19999", "Param compute_graph is nullptr or Allocate Memory for ComputeGraph failed.");
  1100. GELOGE(FAILED, "[Check][Param] new_input_nodes_sub_graph or compute_graph is nullptr.");
  1101. return FAILED;
  1102. }
  1103. for (const auto &node : compute_graph->GetDirectNode()) {
  1104. // All nodes in original graph have a copy in corresponding_node_in_partitions_, so it can not be null
  1105. auto sub_graph = graph_info_.corresponding_node_in_partitions_.at(node)->GetOwnerComputeGraph();
  1106. if ((graph_info_.partitions_2_rank_.find(sub_graph) == graph_info_.partitions_2_rank_.end()) &&
  1107. (graph_info_.partitions_[sub_graph] != kEngineDefaultData)) {
  1108. graph_info_.partitions_2_rank_[sub_graph] = rank;
  1109. graph_info_.rank_2_partitions_.push_back(sub_graph);
  1110. rank++;
  1111. } else if (graph_info_.partitions_[sub_graph] == kEngineDefaultData) { // merge data graph
  1112. if (PutInputNodesInSubGraph(sub_graph, new_input_nodes_sub_graph) != SUCCESS) {
  1113. GELOGE(FAILED, "[Call][putInputNodesInSubGraph] failed.");
  1114. return FAILED;
  1115. }
  1116. auto to_be_del = graph_info_.partitions_.find(sub_graph);
  1117. graph_info_.partitions_.erase(to_be_del);
  1118. }
  1119. }
  1120. if (!new_input_nodes_sub_graph->GetDirectNode().empty()) {
  1121. graph_info_.rank_2_partitions_.insert(graph_info_.rank_2_partitions_.begin(), new_input_nodes_sub_graph);
  1122. graph_info_.partitions_2_rank_[new_input_nodes_sub_graph] = 0;
  1123. AddNewGraphToPartition(new_input_nodes_sub_graph, "inputNodesSubGraph");
  1124. }
  1125. // reinit rank
  1126. rank = kRankZero;
  1127. for (const auto &it : graph_info_.rank_2_partitions_) {
  1128. // rename subGraph based on rank
  1129. if (it != nullptr) {
  1130. // rename subGraph based on rank
  1131. string graph_name =
  1132. "partition" + std::to_string(partition_times_) + "_rank" + std::to_string(rank) + "_" + it->GetName();
  1133. it->SetName(graph_name);
  1134. }
  1135. rank++;
  1136. }
  1137. return SUCCESS;
  1138. }
  1139. AnchorPtr ge::GraphPartitioner::GetEndInAnchor(const AnchorPtr &src_anchor, const NodePtr &end_node) {
  1140. if ((src_anchor == nullptr) || (end_node == nullptr)) {
  1141. REPORT_INNER_ERROR("E19999", "Param src_anchor or end_node is nullptr, check invalid.");
  1142. GELOGE(FAILED, "[Check][Param] parameter src_anchor or end_node is nullptr.");
  1143. return nullptr;
  1144. }
  1145. AnchorPtr end_in_anchor;
  1146. if (Anchor::DynamicAnchorCast<OutDataAnchor>(src_anchor) != nullptr) {
  1147. end_in_anchor = end_node->GetInDataAnchor(0);
  1148. } else {
  1149. end_in_anchor = end_node->GetInControlAnchor();
  1150. }
  1151. return end_in_anchor;
  1152. }
  1153. AnchorPtr ge::GraphPartitioner::GetPldOutAnchor(const NodePtr &pld_node, const AnchorPtr &dst_anchor) {
  1154. if ((pld_node == nullptr) || (dst_anchor == nullptr)) {
  1155. REPORT_INNER_ERROR("E19999", "Param pld_node or dst_anchor is nullptr, check invalid.");
  1156. GELOGE(FAILED, "[Check][Param] parameter pld_node or dst_anchor is nullptr.");
  1157. return nullptr;
  1158. }
  1159. AnchorPtr pld_out_anchor;
  1160. if (Anchor::DynamicAnchorCast<InDataAnchor>(dst_anchor) != nullptr) {
  1161. pld_out_anchor = pld_node->GetOutDataAnchor(0);
  1162. } else {
  1163. pld_out_anchor = pld_node->GetOutControlAnchor();
  1164. }
  1165. return pld_out_anchor;
  1166. }
  1167. void ge::GraphPartitioner::AddEndPldInformationToSubGraphInfo(ge::SubGraphInfoPtr &subgraph_info) {
  1168. if (subgraph_info == nullptr) {
  1169. GELOGE(FAILED, "[Check][Param] parameter subgraph_info is nullptr.");
  1170. return;
  1171. }
  1172. auto subgraph = subgraph_info->GetSubGraph();
  1173. GE_CHECK_NOTNULL_JUST_RETURN(subgraph);
  1174. NodetoNodeMap end_map;
  1175. NodetoNodeMap pld_map;
  1176. for (const auto &node : subgraph->GetDirectNode()) {
  1177. if (node->GetType() == kEndType) {
  1178. end_map[node] = graph_info_.end_2_pld_.at(node);
  1179. }
  1180. if (node->GetType() == kPlaceHolderType) {
  1181. pld_map[node] = graph_info_.pld_2_end_.at(node);
  1182. }
  1183. }
  1184. subgraph_info->SetEnd2PldMap(end_map);
  1185. subgraph_info->SetPld2EndMap(pld_map);
  1186. }
  1187. const Graph2SubGraphInfoList &ge::GraphPartitioner::GetSubGraphMap() { return graph_2_subgraph_list_; }
  1188. void ge::GraphPartitioner::ClearAllPartitionData() {
  1189. graph_2_graph_partition_info_.clear();
  1190. graph_2_subgraph_list_.clear();
  1191. graph_2_input_subgraph_.clear();
  1192. GELOGD("Clear all partition data success.");
  1193. return;
  1194. }
  1195. const NodeEngineMap &GraphPartitioner::GetNodeEngineMap() const {
  1196. return graph_info_.engine_placer_.GetNodeEngineMap(graph_info_.mode_ == kCompositeEnginePartitioning);
  1197. }
  1198. } // namespace ge

图引擎模块（GE）是MindSpore的一个子模块，其代码由C++实现，位于前端模块ME和底层硬件之间，起到承接作用。图引擎模块以ME下发的图作为输入，然后进行一系列的深度图优化操作，最后输出一张可以在底层硬件上高效运行的图。GE针对昇腾AI处理器的硬件结构特点，做了特定的优化工作，以此来充分发挥出昇腾AI处理器的强大算力。在进行模型训练/推理时，GE会被自动调用而用户并不感知。GE主要由GE API和GE Core两部分组成，详细的架构图如下所示：