You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

graph_partition.cc 64 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243
  1. /**
  2. * Copyright 2019-2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "graph/partition/graph_partition.h"
  17. #include <algorithm>
  18. #include <memory>
  19. #include <string>
  20. #include <unordered_set>
  21. #include <vector>
  22. #include "analyzer/analyzer.h"
  23. #include "common/ge/ge_util.h"
  24. #include "common/op/ge_op_utils.h"
  25. #include "framework/common/types.h"
  26. #include "graph/debug/ge_attr_define.h"
  27. #include "graph/manager/graph_manager_utils.h"
  28. #include "graph/common/ge_call_wrapper.h"
  29. #include "graph/utils/graph_utils.h"
  30. #include "graph/utils/op_desc_utils.h"
  31. #include "graph/utils/type_utils.h"
  32. #include "init/gelib.h"
  33. #include "opskernel_manager/ops_kernel_manager.h"
  namespace {
  // File-local string constants.
  constexpr const char *kEngineDefaultData = "ENGINE_DEFAULT_DATA";
  // Node type strings compared against Node::GetType() and used as name prefixes.
  constexpr const char *kEndType = "End";
  constexpr const char *kPlaceHolderType = "PlaceHolder";
  // Expected partition count when only a single graph exists.
  constexpr int kOneGraph = 1;
  // Positions in the graph list (ordered 0,1,2,3...): 1 is the second entry.
  constexpr int kRankOne = 1;
  // Positions in the graph list (ordered 0,1,2,3...): 0 is the first entry.
  constexpr int kRankZero = 0;
  }  // namespace
  42. namespace ge {
  43. Status ge::GraphPartitioner::CheckIfEnd2PldEmpty(ge::ComputeGraphPtr &output_merged_compute_graph) {
  44. // only one condition:no data node, one engine, there is only one graph + input graph
  45. if (graph_info_.partitions_.size() == kOneGraph) {
  46. auto partition = (*graph_info_.partitions_.begin());
  47. if (partition.first == nullptr) {
  48. REPORT_INNER_ERROR("E19999", "partition.first is nullptr, check invalid, engine name is %s",
  49. partition.second.c_str());
  50. GELOGE(GE_GRAPH_EMPTY_PARTITION, "[Check][Param] partition.first is null, engine name is %s",
  51. partition.second.c_str());
  52. return FAILED;
  53. }
  54. output_merged_compute_graph = partition.first;
  55. } else { // if placeholder to end map is empty, it should be an exception condition
  56. REPORT_INNER_ERROR("E19999", "partitions size:%zu is not 1, check invalid.", graph_info_.partitions_.size());
  57. GELOGE(GE_GRAPH_EMPTY_PARTITION,
  58. "[Check][Param] placeholder to end map is empty, partitions size:%zu is not 1.",
  59. graph_info_.partitions_.size());
  60. return FAILED;
  61. }
  62. return SUCCESS;
  63. }
  64. Status ge::GraphPartitioner::MergeAllSubGraph(ge::ComputeGraphPtr &output_merged_compute_graph,
  65. const std::vector<SubGraphInfoPtr> &sub_graph_list) {
  66. for (size_t rank = 0; rank < graph_info_.rank_2_partitions_.size(); rank++) {
  67. string temp_stream;
  68. // sub_graph_list index is one ahead of rank_2_partitions_list index
  69. if (rank > 0) {
  70. temp_stream = sub_graph_list[rank - 1]->GetStreamLabel();
  71. }
  72. for (const auto &node : graph_info_.rank_2_partitions_[rank]->GetDirectNode()) {
  73. if (node == nullptr) {
  74. continue;
  75. }
  76. if ((node->GetType() == kEndType) || (node->GetType() == kPlaceHolderType)) {
  77. continue;
  78. }
  79. if (!temp_stream.empty() && !AttrUtils::HasAttr(node->GetOpDesc(), ATTR_NAME_STREAM_LABEL)) {
  80. (void)AttrUtils::SetStr(node->GetOpDesc(), ATTR_NAME_STREAM_LABEL, temp_stream);
  81. }
  82. if (node->SetOwnerComputeGraph(output_merged_compute_graph) != GRAPH_SUCCESS) {
  83. REPORT_CALL_ERROR("E19999", "SetOwnerComputeGraph for node:%s failed.", node->GetName().c_str());
  84. GELOGE(GE_GRAPH_PARAM_NULLPTR, "[Set][OwnerComputeGraph] failed, node %s", node->GetName().c_str());
  85. return FAILED;
  86. }
  87. (void)output_merged_compute_graph->AddNode(node);
  88. }
  89. }
  90. // get session graph id from subgraph
  91. SetMergedGraphId(output_merged_compute_graph);
  92. return SUCCESS;
  93. }
  94. void ge::GraphPartitioner::SetMergedGraphId(ge::ComputeGraphPtr &output_merged_compute_graph) {
  95. string session_graph_id;
  96. // get session graph id from subgraph
  97. if (graph_info_.rank_2_partitions_.empty() ||
  98. !AttrUtils::GetStr(*(graph_info_.rank_2_partitions_[0]), ATTR_NAME_SESSION_GRAPH_ID, session_graph_id)) {
  99. GELOGW("Get graph session_graph_id attr failed.");
  100. }
  101. // set session graph id into merged subgraph
  102. if (!session_graph_id.empty()) {
  103. GELOGI("Set session graph id %s in merged compute graph", session_graph_id.c_str());
  104. // private function, promise output_merged_compute_graph not null
  105. GE_IF_BOOL_EXEC(!AttrUtils::SetStr(*output_merged_compute_graph, ATTR_NAME_SESSION_GRAPH_ID, session_graph_id),
  106. GELOGW("SetStr ATTR_NAME_SESSION_GRAPH_ID failed");)
  107. }
  108. }
  109. Status ge::GraphPartitioner::RemoveNodeAndEdgeBetweenEndPld(ge::ComputeGraphPtr &output_merged_compute_graph,
  110. const std::vector<SubGraphInfoPtr> &sub_graph_list) {
  111. if ((output_merged_compute_graph == nullptr) ||
  112. (MergeAllSubGraph(output_merged_compute_graph, sub_graph_list) != SUCCESS)) {
  113. REPORT_INNER_ERROR("E19999", "output_merged_compute_graph is nullptr or Call MergeAllSubGraph failed.");
  114. GELOGE(GE_GRAPH_PARAM_NULLPTR, "[Merge][AllSubGraph] failed.");
  115. return FAILED;
  116. }
  117. for (const auto &it : graph_info_.index_2_end_) {
  118. auto &end = it.second;
  119. auto &pld = graph_info_.end_2_pld_[it.second];
  120. if ((end != nullptr) && (pld != nullptr) && (end->GetInDataAnchor(0) != nullptr) &&
  121. (pld->GetOutDataAnchor(0) != nullptr)) {
  122. AnchorPtr end_in_anchor = (end->GetInDataAnchor(0)->GetFirstPeerAnchor() == nullptr)
  123. ? Anchor::DynamicAnchorCast<Anchor>(end->GetInControlAnchor())
  124. : Anchor::DynamicAnchorCast<Anchor>(end->GetInDataAnchor(0));
  125. AnchorPtr pld_out_anchor = (pld->GetOutDataAnchor(0)->GetFirstPeerAnchor() == nullptr)
  126. ? Anchor::DynamicAnchorCast<Anchor>(pld->GetOutControlAnchor())
  127. : Anchor::DynamicAnchorCast<Anchor>(pld->GetOutDataAnchor(0));
  128. auto src_anchor = end_in_anchor->GetFirstPeerAnchor(); // src_anchor should be only 1
  129. if (GraphUtils::RemoveEdge(src_anchor, end_in_anchor) != GRAPH_SUCCESS) {
  130. REPORT_CALL_ERROR("E19999", "RemoveEdge between %s and %s failed",
  131. src_anchor->GetOwnerNode()->GetName().c_str(),
  132. end_in_anchor->GetOwnerNode()->GetName().c_str());
  133. GELOGE(GE_GRAPH_PARAM_NULLPTR, "[Remove][Edge] between %s and %s failed. node_name:%s, graph_name:%s",
  134. src_anchor->GetOwnerNode()->GetName().c_str(), end_in_anchor->GetOwnerNode()->GetName().c_str(),
  135. end->GetName().c_str(), end->GetOwnerComputeGraph()->GetName().c_str());
  136. return FAILED;
  137. }
  138. GE_CHECK_NOTNULL(pld_out_anchor);
  139. for (const auto &peer_in_anchor : pld_out_anchor->GetPeerAnchors()) {
  140. if (GraphUtils::RemoveEdge(pld_out_anchor, peer_in_anchor) != GRAPH_SUCCESS) {
  141. REPORT_CALL_ERROR("E19999", "RemoveEdge between %s and %s failed",
  142. pld_out_anchor->GetOwnerNode()->GetName().c_str(),
  143. peer_in_anchor->GetOwnerNode()->GetName().c_str());
  144. GELOGE(GE_GRAPH_PARAM_NULLPTR, "[Remove][Edge] between %s and %s failed. node_name:%s, graph_name:%s",
  145. pld_out_anchor->GetOwnerNode()->GetName().c_str(), peer_in_anchor->GetOwnerNode()->GetName().c_str(),
  146. pld->GetName().c_str(), pld->GetOwnerComputeGraph()->GetName().c_str());
  147. return FAILED;
  148. }
  149. if (GraphUtils::AddEdge(src_anchor, peer_in_anchor) != GRAPH_SUCCESS) {
  150. REPORT_CALL_ERROR("E19999", "AddEdge from %s to %s failed.",
  151. src_anchor->GetOwnerNode()->GetName().c_str(),
  152. peer_in_anchor->GetOwnerNode()->GetName().c_str());
  153. GELOGE(GE_GRAPH_PARAM_NULLPTR, "[Add][Edge] from %s to %s failed.",
  154. src_anchor->GetOwnerNode()->GetName().c_str(), peer_in_anchor->GetOwnerNode()->GetName().c_str());
  155. return FAILED;
  156. }
  157. }
  158. } else {
  159. GELOGW("End or pld is nullptr or in data anchor of end is nullptr or out data anchor of pld is nullptr");
  160. }
  161. }
  162. return SUCCESS;
  163. }
  164. Status ge::GraphPartitioner::MergeAfterSubGraphOptimization(ge::ComputeGraphPtr &output_merged_compute_graph,
  165. const ge::ComputeGraphPtr &original_compute_graph) {
  166. Status real_ret = SUCCESS;
  167. auto ret = MergeSubGraph(output_merged_compute_graph, original_compute_graph);
  168. if (ret != SUCCESS) {
  169. // even though failed, ensure all op do finish check support
  170. real_ret = FAILED;
  171. GELOGE(ret, "[Merge][SubGraph] Failed, ret:%d", ret);
  172. }
  173. GE_CHECK_NOTNULL(original_compute_graph);
  174. output_merged_compute_graph->SetName(original_compute_graph->GetName());
  175. // partition sub graph
  176. for (const auto &sub_graph : original_compute_graph->GetAllSubgraphs()) {
  177. ComputeGraphPtr merged_sub_graph = nullptr;
  178. ret = MergeSubGraph(merged_sub_graph, sub_graph);
  179. if (ret != SUCCESS) {
  180. real_ret = FAILED;
  181. GELOGE(ret, "[Merge][SubGraph] Failed, ret:%d", ret);
  182. continue;
  183. }
  184. // this means subgraph added in optimize subgraph and without partitions, so just add to root graph
  185. if (merged_sub_graph == sub_graph) {
  186. GELOGI("Just add subgraph %s (parent node is %s) to root graph %s.", sub_graph->GetName().c_str(),
  187. sub_graph->GetParentNode()->GetName().c_str(), output_merged_compute_graph->GetName().c_str());
  188. sub_graph->SetParentGraph(sub_graph->GetParentNode()->GetOwnerComputeGraph());
  189. GE_IF_BOOL_EXEC(output_merged_compute_graph->AddSubgraph(sub_graph->GetName(), merged_sub_graph) != SUCCESS,
  190. return FAILED;)
  191. continue;
  192. }
  193. // add sub graph
  194. merged_sub_graph->SetName(sub_graph->GetName());
  195. merged_sub_graph->SetInputSize(sub_graph->GetInputSize());
  196. merged_sub_graph->SetOutputSize(sub_graph->GetOutputSize());
  197. auto parent_node = sub_graph->GetParentNode();
  198. GE_IF_BOOL_EXEC(parent_node == nullptr,
  199. REPORT_INNER_ERROR("E19999", "Parent node of graph:%s is nullptr.",
  200. sub_graph->GetName().c_str());
  201. GELOGE(FAILED, "[Check][Param] Parent node is null, graph name is %s",
  202. sub_graph->GetName().c_str());
  203. return FAILED;)
  204. auto original_graph = parent_node->GetOwnerComputeGraph();
  205. GE_IF_BOOL_EXEC(graph_2_graph_partition_info_.find(original_graph) == graph_2_graph_partition_info_.end(),
  206. REPORT_INNER_ERROR("E19999", "graph:%s not find in graph_2_graph_partition_info_, check invalid.",
  207. original_graph->GetName().c_str());
  208. GELOGE(FAILED, "[Check][Param] Find graph info failed, graph name is %s",
  209. original_graph->GetName().c_str());
  210. return FAILED;)
  211. auto graph_info = graph_2_graph_partition_info_[original_graph];
  212. GE_IF_BOOL_EXEC(graph_info.corresponding_node_in_partitions_.count(parent_node) == 0,
  213. REPORT_INNER_ERROR("E19999", "node:%s not find in corresponding_node_in_partitions_, "
  214. "check invalid", parent_node->GetName().c_str());
  215. GELOGE(FAILED, "[Check][Param] Find corresponding node failed, parent node name is %s",
  216. parent_node->GetName().c_str());
  217. return FAILED;)
  218. auto corresponding_node = graph_info.corresponding_node_in_partitions_[parent_node];
  219. GE_IF_BOOL_EXEC(corresponding_node == nullptr,
  220. REPORT_INNER_ERROR("E19999", "Get null node in corresponding_node_in_partitions_, "
  221. "first node name is %s", parent_node->GetName().c_str());
  222. GELOGE(FAILED, "[Check][Param] Get null node in corresponding_node_in_partitions_, "
  223. "first node name is %s", parent_node->GetName().c_str());
  224. return FAILED;);
  225. merged_sub_graph->SetParentNode(corresponding_node);
  226. auto subgraph_parent_graph = corresponding_node->GetOwnerComputeGraph();
  227. merged_sub_graph->SetParentGraph(subgraph_parent_graph);
  228. ret = output_merged_compute_graph->AddSubgraph(sub_graph->GetName(), merged_sub_graph);
  229. GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, return ret;)
  230. }
  231. ClearAllPartitionData();
  232. if (real_ret != SUCCESS) {
  233. auto root_graph = ge::GraphUtils::FindRootGraph(original_compute_graph);
  234. GE_CHECK_NOTNULL(root_graph);
  235. (void)Analyzer::GetInstance()->SaveAnalyzerDataToFile(root_graph->GetSessionID(), root_graph->GetGraphID());
  236. }
  237. return real_ret;
  238. }
  239. Status ge::GraphPartitioner::MergeSubGraph(ge::ComputeGraphPtr &output_merged_compute_graph,
  240. const ge::ComputeGraphPtr &original_compute_graph) {
  241. if (original_compute_graph == nullptr) {
  242. REPORT_INNER_ERROR("E19999", "Param original_compute_graph is nullptr, check invalid.");
  243. GELOGE(GE_GRAPH_NULL_INPUT, "[Check][Param] original_compute_graph is nullptr.");
  244. return FAILED;
  245. }
  246. if ((graph_2_graph_partition_info_.find(original_compute_graph) == graph_2_graph_partition_info_.end()) ||
  247. (graph_2_subgraph_list_.find(original_compute_graph) == graph_2_subgraph_list_.end())) {
  248. GELOGW("[GraphPartition]: compute_graph has not found, just return original.");
  249. output_merged_compute_graph = original_compute_graph;
  250. return SUCCESS;
  251. }
  252. GraphPartitionInfo &subgraph_info = graph_2_graph_partition_info_[original_compute_graph];
  253. const auto &sub_graph_list = graph_2_subgraph_list_[original_compute_graph];
  254. graph_info_ = subgraph_info;
  255. if (graph_info_.mode_ != kMerging) {
  256. REPORT_INNER_ERROR("E19999", "Cannot call merging in partition mode, as mode != %d", kMerging);
  257. GELOGE(GE_GRAPH_UNSUPPORTED, "[Check][Param] Cannot call merging in partition mode, as mode != %d", kMerging);
  258. return FAILED;
  259. }
  260. GELOGD("Graph merge starts.");
  261. // check input param
  262. for (const auto &it : sub_graph_list) {
  263. if (it == nullptr) {
  264. REPORT_INNER_ERROR("E19999", "sub_graph is nullptr, check invalid.");
  265. GELOGE(GE_GRAPH_PARAM_NULLPTR, "[Check][Param] merging sub-graphs failed, sub-graph is nullptr");
  266. return FAILED;
  267. }
  268. }
  269. bool is_map_empty = graph_info_.end_2_pld_.empty() || graph_info_.pld_2_end_.empty();
  270. if (is_map_empty) {
  271. if (CheckIfEnd2PldEmpty(output_merged_compute_graph) != SUCCESS) {
  272. return FAILED;
  273. }
  274. }
  275. ComputeGraphPtr new_sub_graph = MakeShared<ComputeGraph>(original_compute_graph->GetName());
  276. GE_CHECK_NOTNULL(new_sub_graph);
  277. output_merged_compute_graph = new_sub_graph;
  278. GE_TIMESTAMP_START(MergeSubGraphRemoveNode);
  279. if (RemoveNodeAndEdgeBetweenEndPld(output_merged_compute_graph, sub_graph_list) != ge::SUCCESS) {
  280. GELOGE(GE_GRAPH_PARAM_NULLPTR, "[Call][RemoveNodeAndEdgeBetweenEndPld] failed, graph:%s",
  281. output_merged_compute_graph->GetName().c_str());
  282. return FAILED;
  283. }
  284. GE_TIMESTAMP_END(MergeSubGraphRemoveNode, "GraphPartitioner::MergeGraphRemoveNodeAndEdge");
  285. GE_TIMESTAMP_START(MergeSubGraphTopologicalSorting);
  286. Status ret = output_merged_compute_graph->TopologicalSorting();
  287. if (ret != SUCCESS) {
  288. GELOGE(GE_GRAPH_TOPO_SORT_FAILED, "[Call][TopologicalSorting] for output_merged_compute_graph:%s failed",
  289. output_merged_compute_graph->GetName().c_str());
  290. return FAILED;
  291. }
  292. GE_TIMESTAMP_END(MergeSubGraphTopologicalSorting, "GraphPartitioner::MergeGraphTopologicalSorting");
  293. // flush all nodes' engine of merged graph
  294. GE_TIMESTAMP_START(MergeSubGraphEnginePlacerRun);
  295. graph_info_.engine_placer_.SetComputeGraph(output_merged_compute_graph);
  296. if (graph_info_.engine_placer_.Run() != SUCCESS) {
  297. GELOGE(GE_GRAPH_INIT_FAILED, "[Call][Run] engine_placer run failed, graph:%s",
  298. output_merged_compute_graph->GetName().c_str());
  299. return FAILED;
  300. }
  301. GE_TIMESTAMP_END(MergeSubGraphEnginePlacerRun, "GraphPartitioner::MergeGraphEnginePlacerRun");
  302. GELOGD("Graph merge ends.");
  303. return SUCCESS;
  304. }
  305. Status ge::GraphPartitioner::UpdatePldOpDesc(const NodePtr &dst_node, int input_index, OpDescPtr &pld_op_desc) {
  306. if ((dst_node == nullptr) || (pld_op_desc == nullptr) || (dst_node->GetOpDesc() == nullptr)) {
  307. REPORT_INNER_ERROR("E19999", "Param dst_node or pld_op_desc or op of dst_node is nullptr, check invalid");
  308. GELOGE(FAILED, "[Check][Param] parameter ptr is null.");
  309. return FAILED;
  310. }
  311. const auto &input_desc = dst_node->GetOpDesc()->GetInputDesc(static_cast<uint32_t>(input_index));
  312. GE_IF_BOOL_EXEC(pld_op_desc->AddOutputDesc(input_desc) != GRAPH_SUCCESS,
  313. REPORT_CALL_ERROR("E19999", "AddOutputDesc to op:%s failed", pld_op_desc->GetName().c_str());
  314. GELOGE(FAILED, "[Add][OutputDesc] to op:%s failed", pld_op_desc->GetName().c_str());
  315. return FAILED;)
  316. if (pld_op_desc->MutableOutputDesc(0) != nullptr) {
  317. ge::TensorUtils::SetRealDimCnt(*(pld_op_desc->MutableOutputDesc(0).get()),
  318. static_cast<uint32_t>(input_desc.GetShape().GetDims().size()));
  319. } else {
  320. REPORT_INNER_ERROR("E19999", "output(0) of op:%s is nullptr, check invalid", pld_op_desc->GetName().c_str());
  321. GELOGE(GE_GRAPH_ADD_PLC_END_FAILED, "[Check][Param] output(0) of op:%s is nullptr.",
  322. pld_op_desc->GetName().c_str());
  323. return FAILED;
  324. }
  325. return SUCCESS;
  326. }
  327. Status ge::GraphPartitioner::UpdateEndOpDesc(const NodePtr &src_node, int output_index, OpDescPtr &end_op_desc) {
  328. if ((src_node == nullptr) || (end_op_desc == nullptr) || (src_node->GetOpDesc() == nullptr)) {
  329. REPORT_INNER_ERROR("E19999", "Param src_node or end_op_desc or op of src_node is nullptr, check invalid.");
  330. GELOGE(FAILED, "[Check][Param] parameter ptr is null.");
  331. return FAILED;
  332. }
  333. const auto &output_desc = src_node->GetOpDesc()->GetOutputDesc(static_cast<uint32_t>(output_index));
  334. GE_IF_BOOL_EXEC(end_op_desc->AddInputDesc(output_desc) != GRAPH_SUCCESS,
  335. REPORT_CALL_ERROR("E19999", "AddInputDesc to op:%s failed", end_op_desc->GetName().c_str());
  336. GELOGE(FAILED, "[Add][InputDesc] to op:%s failed", end_op_desc->GetName().c_str());
  337. return FAILED;)
  338. if (end_op_desc->MutableInputDesc(0) != nullptr) {
  339. ge::TensorUtils::SetRealDimCnt(*(end_op_desc->MutableInputDesc(0).get()),
  340. static_cast<uint32_t>(output_desc.GetShape().GetDims().size()));
  341. } else {
  342. REPORT_INNER_ERROR("E19999", "input(0) of op:%s is nullptr, check invalid.", end_op_desc->GetName().c_str());
  343. GELOGE(GE_GRAPH_ADD_PLC_END_FAILED, "[Check][Param] input(0) of op:%s is nullptr.",
  344. end_op_desc->GetName().c_str());
  345. return FAILED;
  346. }
  347. return SUCCESS;
  348. }
  349. graphStatus ge::GraphPartitioner::AddPlaceHolderEndInSrcDstGraph(const AnchorPtr &out_anchor,
  350. const AnchorPtr &peer_in_anchor,
  351. const ge::ComputeGraphPtr &pld_graph,
  352. const ge::ComputeGraphPtr &end_graph) {
  353. GE_CHECK_NOTNULL(peer_in_anchor);
  354. GE_CHECK_NOTNULL(pld_graph);
  355. GE_CHECK_NOTNULL(out_anchor);
  356. GE_CHECK_NOTNULL(end_graph);
  357. const auto &src_node = out_anchor->GetOwnerNode();
  358. const auto &dst_node = peer_in_anchor->GetOwnerNode();
  359. // link input -> end
  360. string end_name = kEndType + std::to_string(graph_info_.num_of_pld_end_);
  361. auto end_op_desc = MakeShared<OpDesc>(end_graph->GetName() + "_" + end_name, END);
  362. if (end_op_desc == nullptr) {
  363. REPORT_CALL_ERROR("E19999", "New Memory for OpDesc failed.");
  364. GELOGE(GRAPH_PARAM_INVALID, "[New][Memory] for OpDesc failed, pld_op_desc is nullptr.");
  365. return FAILED;
  366. }
  367. GE_IF_BOOL_EXEC(!AttrUtils::SetInt(end_op_desc, "peerIndex", graph_info_.num_of_pld_end_),
  368. GELOGW("SetInt peerIndex failed");)
  369. GE_IF_BOOL_EXEC(!AttrUtils::SetStr(end_op_desc, "parentOpType", dst_node->GetType()),
  370. GELOGW("SetStr parentOpType failed");)
  371. GE_IF_BOOL_EXEC(!end_op_desc->SetExtAttr("parentNode", dst_node),
  372. GELOGW("SetEndExtAttr parentNode failed");)
  373. OpDescPtr dst_node_op_desc = dst_node->GetOpDesc();
  374. GE_CHECK_NOTNULL(dst_node_op_desc);
  375. GE_IF_BOOL_EXEC(!AttrUtils::SetStr(end_op_desc, ATTR_NAME_END_REAR_NODE_ENGINE_NAME,
  376. dst_node_op_desc->GetOpEngineName()), GELOGW("SetStr rearNodeEngineName failed");)
  377. // replace input_desc of end with owner node's desc
  378. int output_index = ge::AnchorUtils::GetIdx(out_anchor);
  379. bool is_need_update_desc = (output_index >= 0) && (graph_info_.mode_ == kPartitioning);
  380. if (is_need_update_desc) {
  381. if (UpdateEndOpDesc(src_node, output_index, end_op_desc) != SUCCESS) {
  382. GELOGE(GRAPH_PARAM_INVALID, "[Update][EndOpDesc] failed, input index:%d, end_op_desc:%s",
  383. output_index, end_op_desc->GetName().c_str());
  384. return FAILED;
  385. }
  386. } else {
  387. GeTensorDesc input_desc;
  388. if (end_op_desc->AddInputDesc(input_desc) != SUCCESS) {
  389. REPORT_CALL_ERROR("E19999", "add input desc to op:%s failed, input index:%d",
  390. end_op_desc->GetName().c_str(), output_index);
  391. GELOGE(GRAPH_PARAM_INVALID, "[Add][InputDesc] to op:%s failed, input index %d",
  392. end_op_desc->GetName().c_str(), output_index);
  393. return FAILED;
  394. }
  395. }
  396. NodePtr new_end_node = end_graph->AddNode(end_op_desc);
  397. if (new_end_node == nullptr) {
  398. REPORT_CALL_ERROR("E19999", "add node:%s in graph:%s failed",
  399. end_op_desc->GetName().c_str(), end_graph->GetName().c_str());
  400. GELOGE(GRAPH_PARAM_INVALID, "[Add][Node] %s in graph:%s failed.",
  401. end_op_desc->GetName().c_str(), end_graph->GetName().c_str());
  402. return FAILED;
  403. }
  404. GE_IF_BOOL_EXEC(new_end_node->SetOwnerComputeGraph(end_graph) != GRAPH_SUCCESS,
  405. REPORT_CALL_ERROR("E19999", "SetOwnerComputeGraph %s for node:%s failed",
  406. end_graph->GetName().c_str(), new_end_node->GetName().c_str());
  407. GELOGE(GRAPH_PARAM_INVALID, "[Set][OwnerComputeGraph] %s for node:%s failed",
  408. end_graph->GetName().c_str(), new_end_node->GetName().c_str());
  409. return FAILED;)
  410. AnchorPtr end_dst_anchor = GetEndInAnchor(out_anchor, new_end_node);
  411. if (GraphUtils::AddEdge(out_anchor, end_dst_anchor) != GRAPH_SUCCESS) {
  412. REPORT_CALL_ERROR("E19999", "add edge from %s to %s failed", out_anchor->GetOwnerNode()->GetName().c_str(),
  413. end_dst_anchor->GetOwnerNode()->GetName().c_str());
  414. GELOGE(GE_GRAPH_ADD_PLC_END_FAILED, "[Add][Edge] from %s to %s failed",
  415. out_anchor->GetOwnerNode()->GetName().c_str(), end_dst_anchor->GetOwnerNode()->GetName().c_str());
  416. return FAILED;
  417. }
  418. /// For fe, op id has been set in AddNode,
  419. /// we can take op id of srcNode as the mark of parentId now
  420. const auto &src_node_opdesc = src_node->GetOpDesc();
  421. GE_CHECK_NOTNULL(src_node_opdesc);
  422. int64_t node_id = src_node_opdesc->GetId();
  423. const string pld_name = kPlaceHolderType + std::to_string(graph_info_.num_of_pld_end_);
  424. auto pld_op_desc = MakeShared<OpDesc>(pld_graph->GetName() + "_" + pld_name, PLACEHOLDER);
  425. if (pld_op_desc == nullptr) {
  426. REPORT_CALL_ERROR("E19999", "New Memory for OpDesc failed.");
  427. GELOGE(GRAPH_PARAM_INVALID, "[New][Memory] for OpDesc failed.");
  428. return FAILED;
  429. }
  430. GE_IF_BOOL_EXEC(!AttrUtils::SetInt(pld_op_desc, "peerIndex", graph_info_.num_of_pld_end_),
  431. GELOGW("SetInt peerIndex failed");)
  432. GE_IF_BOOL_EXEC(!AttrUtils::SetStr(pld_op_desc, "_peerNodeName", new_end_node->GetName()),
  433. GELOGW("SetStr _peerNodeName failed");)
  434. GE_IF_BOOL_EXEC(!AttrUtils::SetStr(pld_op_desc, "parentOpType", src_node->GetType()),
  435. GELOGW("SetStr parentOpType failed");)
  436. GE_IF_BOOL_EXEC(!AttrUtils::SetStr(pld_op_desc, "_parentNodeName", src_node->GetName()),
  437. GELOGW("SetStr parentOpName failed");)
  438. GE_IF_BOOL_EXEC(!AttrUtils::SetStr(pld_op_desc, "parentId", end_graph->GetName() + ":" + std::to_string(node_id)),
  439. GELOGW("SetStr parentId failed");)
  440. GE_IF_BOOL_EXEC(!AttrUtils::SetInt(pld_op_desc, "anchorIndex", AnchorUtils::GetIdx(out_anchor)),
  441. GELOGW("SetInt anchorIndex failed");)
  442. GE_IF_BOOL_EXEC(!pld_op_desc->SetExtAttr("parentNode", src_node),
  443. GELOGW("SetPldExtAttr parentNode failed");)
  444. GE_IF_BOOL_EXEC(!AttrUtils::SetStr(pld_op_desc, ATTR_NAME_PLD_FRONT_NODE_ENGINE_NAME,
  445. src_node_opdesc->GetOpEngineName()), GELOGW("SetStr frontNodeEngineName failed");)
  446. std::string l2_info_attr;
  447. if (AttrUtils::GetStr(src_node_opdesc, "_task_L2FusionInfo", l2_info_attr)) {
  448. GE_IF_BOOL_EXEC(!AttrUtils::SetStr(pld_op_desc, "_task_L2FusionInfo", l2_info_attr),
  449. GELOGW("SetStr l2_info_attr failed");)
  450. }
  451. int64_t anchor_index_for_lxfusion;
  452. if (AttrUtils::GetInt(src_node_opdesc, "_data_anchor_index_for_lxfusion", anchor_index_for_lxfusion)) {
  453. GE_IF_BOOL_EXEC(!AttrUtils::SetInt(pld_op_desc, "_data_anchor_index_for_lxfusion", anchor_index_for_lxfusion),
  454. GELOGW("SetInt anchor_index_for_lxfusion failed");)
  455. }
  456. // do not care over flow
  457. graph_info_.num_of_pld_end_++;
  458. // replace output_desc of pld with input node's output desc
  459. int input_index = ge::AnchorUtils::GetIdx(peer_in_anchor);
  460. is_need_update_desc = (input_index >= 0) && (graph_info_.mode_ == kPartitioning);
  461. if (is_need_update_desc) {
  462. if (UpdatePldOpDesc(dst_node, input_index, pld_op_desc) != SUCCESS) {
  463. GELOGE(GRAPH_PARAM_INVALID, "[Update][PldOpDesc] failed, output index:%d, pld_op_desc:%s",
  464. input_index, pld_op_desc->GetName().c_str());
  465. return FAILED;
  466. }
  467. } else {
  468. GeTensorDesc output_desc;
  469. if (pld_op_desc->AddOutputDesc(output_desc) != SUCCESS) {
  470. REPORT_CALL_ERROR("E19999", "AddOutputDesc to op:%s failed, input index %d",
  471. pld_op_desc->GetName().c_str(), input_index);
  472. GELOGE(GRAPH_PARAM_INVALID, "[Add][OutputDesc] to op:%s failed, input index %d",
  473. pld_op_desc->GetName().c_str(), input_index);
  474. return FAILED;
  475. }
  476. }
  477. NodePtr new_pld_node = pld_graph->AddNode(pld_op_desc);
  478. if (new_pld_node == nullptr) {
  479. REPORT_CALL_ERROR("E19999", "AddNode %s in graph:%s failed.",
  480. pld_op_desc->GetName().c_str(), pld_graph->GetName().c_str());
  481. GELOGE(GRAPH_PARAM_INVALID, "[Add][Node] %s in graph:%s failed.",
  482. pld_op_desc->GetName().c_str(), pld_graph->GetName().c_str());
  483. return FAILED;
  484. }
  485. GE_IF_BOOL_EXEC(new_pld_node->SetOwnerComputeGraph(pld_graph) != GRAPH_SUCCESS,
  486. REPORT_CALL_ERROR("E19999", "SetOwnerComputeGraph for node:%s failed, graph:%s",
  487. new_pld_node->GetName().c_str(), pld_graph->GetName().c_str());
  488. GELOGE(GRAPH_PARAM_INVALID, "[Set][OwnerComputeGraph] for node:%s failed, graph:%s",
  489. new_pld_node->GetName().c_str(), pld_graph->GetName().c_str());
  490. return FAILED;)
  491. AnchorPtr pld_src_anchor = GetPldOutAnchor(new_pld_node, peer_in_anchor);
  492. // link placeHolder -> computeNode
  493. if (GraphUtils::AddEdge(pld_src_anchor, peer_in_anchor) != GRAPH_SUCCESS) {
  494. REPORT_CALL_ERROR("E19999", "AddEdge from %s to %s failed",
  495. pld_src_anchor->GetOwnerNode()->GetName().c_str(),
  496. peer_in_anchor->GetOwnerNode()->GetName().c_str());
  497. GELOGE(GE_GRAPH_ADD_PLC_END_FAILED, "[Add][Edge] from %s to %s failed",
  498. pld_src_anchor->GetOwnerNode()->GetName().c_str(), peer_in_anchor->GetOwnerNode()->GetName().c_str());
  499. return FAILED;
  500. }
  501. graph_info_.index_2_end_[graph_info_.num_of_pld_end_] = new_end_node;
  502. graph_info_.pld_2_end_[new_pld_node] = new_end_node;
  503. graph_info_.end_2_pld_[new_end_node] = new_pld_node;
  504. return SUCCESS;
  505. }
  506. Status ge::GraphPartitioner::LinkInput2EndRemoveOrginalLink(ge::NodePtr input_node, ge::ComputeGraphPtr src_graph,
  507. ge::ComputeGraphPtr dst_graph) {
  508. if ((input_node == nullptr) || (src_graph == nullptr) || (dst_graph == nullptr)) {
  509. REPORT_INNER_ERROR("E19999", "Param input_node or src_graph or dst_graph is nullptr, check invalid.");
  510. GELOGE(FAILED, "[Check][Param] parameter input_node or src_graph or dst_graph is nullptr.");
  511. return FAILED;
  512. }
  513. // get the original anchors and remove the original link
  514. for (const auto &out_data_anchor : input_node->GetAllOutAnchors()) {
  515. for (auto &peer_in_anchor : out_data_anchor->GetPeerAnchors()) {
  516. if (peer_in_anchor->GetOwnerNode()->GetType() != kEndType) {
  517. if (GraphUtils::RemoveEdge(out_data_anchor, peer_in_anchor) != GRAPH_SUCCESS) {
  518. REPORT_CALL_ERROR("E19999", "RemoveEdge between %s and %s failed.",
  519. out_data_anchor->GetOwnerNode()->GetName().c_str(),
  520. peer_in_anchor->GetOwnerNode()->GetName().c_str());
  521. GELOGE(FAILED, "[Remove][Edge] between %s and %s failed.",
  522. out_data_anchor->GetOwnerNode()->GetName().c_str(), peer_in_anchor->GetOwnerNode()->GetName().c_str());
  523. return FAILED;
  524. }
  525. // link input -> end
  526. auto ret = AddPlaceHolderEndInSrcDstGraph(out_data_anchor, peer_in_anchor, src_graph, dst_graph);
  527. if (ret != SUCCESS) {
  528. GELOGE(GE_GRAPH_ADD_PLC_END_FAILED, "[Call][AddPlaceHolderEndInSrcDstGraph] failed, ret:%d.", ret);
  529. return ret;
  530. }
  531. } else {
  532. auto end_node = peer_in_anchor->GetOwnerNode();
  533. if (GraphUtils::RemoveJustNode(src_graph, end_node) != GRAPH_SUCCESS) {
  534. REPORT_CALL_ERROR("E19999", "RemoveJustNode %s from graph:%s failed.",
  535. end_node->GetName().c_str(), src_graph->GetName().c_str());
  536. GELOGE(FAILED, "[Remove][JustNode] %s from graph:%s failed.",
  537. end_node->GetName().c_str(), src_graph->GetName().c_str());
  538. return FAILED;
  539. }
  540. if (end_node->SetOwnerComputeGraph(dst_graph) != GRAPH_SUCCESS) {
  541. REPORT_CALL_ERROR("E19999", "SetOwnerComputeGraph for node:%s failed, graph:%s.",
  542. end_node->GetName().c_str(), dst_graph->GetName().c_str());
  543. GELOGE(FAILED, "[Set][OwnerComputeGraph] to node:%s failed, graph:%s.",
  544. end_node->GetName().c_str(), dst_graph->GetName().c_str());
  545. return FAILED;
  546. }
  547. if (dst_graph->AddNode(end_node) == nullptr) {
  548. REPORT_CALL_ERROR("E19999", "AddNode %s in graph:%s failed.",
  549. end_node->GetName().c_str(), dst_graph->GetName().c_str());
  550. GELOGE(FAILED, "[Add][Node] %s in graph:%s failed.",
  551. end_node->GetName().c_str(), dst_graph->GetName().c_str());
  552. return FAILED;
  553. }
  554. }
  555. }
  556. }
  557. return SUCCESS;
  558. }
  559. Status ge::GraphPartitioner::PutInputNodesInSubGraph(const ge::ComputeGraphPtr &src_graph,
  560. const ge::ComputeGraphPtr &dst_graph) {
  561. if ((src_graph == nullptr) || (dst_graph == nullptr)) {
  562. REPORT_INNER_ERROR("E19999", "Param src_graph or dst_graph is nullptr, check invalid.");
  563. GELOGE(FAILED, "[Check][Param] parameter src_graph or dst_graph is nullptr.");
  564. return FAILED;
  565. }
  566. for (auto &input_node : src_graph->GetDirectNode()) {
  567. if (IsDataLike(input_node)) {
  568. if (input_node->SetOwnerComputeGraph(dst_graph) != GRAPH_SUCCESS) {
  569. REPORT_CALL_ERROR("E19999", "SetOwnerComputeGraph for node:%s failed, graph:%s.",
  570. input_node->GetName().c_str(), dst_graph->GetName().c_str());
  571. GELOGE(FAILED, "[Set][OwnerComputeGraph] for node:%s failed, graph:%s.",
  572. input_node->GetName().c_str(), dst_graph->GetName().c_str());
  573. return FAILED;
  574. }
  575. // remove input node from src_graph
  576. if (GraphUtils::RemoveJustNode(src_graph, input_node) != GRAPH_SUCCESS) {
  577. REPORT_CALL_ERROR("E19999", "RemoveJustNode %s from graph:%s failed.",
  578. input_node->GetName().c_str(), src_graph->GetName().c_str());
  579. GELOGE(FAILED, "[Remove][JustNode] %s from graph:%s failed.",
  580. input_node->GetName().c_str(), src_graph->GetName().c_str());
  581. return FAILED;
  582. }
  583. // add input node to dst_graph
  584. if (dst_graph->AddNode(input_node) == nullptr) {
  585. REPORT_CALL_ERROR("E19999", "AddNode %s in graph:%s failed.",
  586. input_node->GetName().c_str(), src_graph->GetName().c_str());
  587. GELOGE(FAILED, "[Add][Node] %s in graph:%s failed.",
  588. input_node->GetName().c_str(), src_graph->GetName().c_str());
  589. return FAILED;
  590. }
  591. if (LinkInput2EndRemoveOrginalLink(input_node, src_graph, dst_graph) != ge::SUCCESS) {
  592. GELOGE(FAILED, "[Call][LinkInput2EndRemoveOrginalLink] failed.");
  593. return FAILED;
  594. }
  595. }
  596. }
  597. return SUCCESS;
  598. }
  599. void ge::GraphPartitioner::AddNewGraphToPartition(ge::ComputeGraphPtr &input_graph, const std::string &engine_name) {
  600. if (input_graph == nullptr) {
  601. GELOGW("[GraphPartitioner]: input_graph is null, engine name is %s", engine_name.c_str());
  602. return;
  603. }
  604. graph_info_.partitions_[input_graph] = engine_name;
  605. }
  606. bool ge::GraphPartitioner::IsDataLike(ge::NodePtr node) {
  607. return (node->GetType() == CONSTANT) || (node->GetType() == DATA) || (node->GetType() == AIPPDATA) ||
  608. (node->GetType() == CONSTANTOP) || (node->GetType() == VARIABLE);
  609. }
  610. bool ge::GraphPartitioner::HasNoInput(ge::NodePtr node) {
  611. if (node == nullptr) {
  612. GELOGE(FAILED, "[Check][Param] node is nullptr.");
  613. return true;
  614. }
  615. return node->GetInNodes().empty();
  616. }
  617. Status ge::GraphPartitioner::Initialize(ge::ComputeGraphPtr compute_graph) {
  618. GELOGI("Initialize starts.");
  619. std::shared_ptr<GELib> instance_ptr = ge::GELib::GetInstance();
  620. if (instance_ptr == nullptr || compute_graph == nullptr) {
  621. REPORT_INNER_ERROR("E19999", "compute_graph or instance_ptr of GELib is nullptr, check invalid.");
  622. GELOGE(GE_GRAPH_NOT_INIT, "[Check][Param] compute_graph or instance_ptr of GELib is nullptr.");
  623. return FAILED;
  624. }
  625. graph_info_.engine_placer_.SetComputeGraph(compute_graph);
  626. if (graph_info_.engine_placer_.Run() != SUCCESS) {
  627. GELOGE(FAILED, "[Call][Run] Engine placer run failed, graph:%s.", compute_graph->GetName().c_str());
  628. return FAILED;
  629. }
  630. const NodeEngineMap *node_engine_map = graph_info_.engine_placer_.GetNodeEngineMap();
  631. size_t temp_index = 0;
  632. // travese nodes by topo order one by one
  633. for (const auto &node : compute_graph->GetDirectNode()) {
  634. std::string temp_stream;
  635. // node opdesc has been checked before
  636. (void)AttrUtils::GetStr(node->GetOpDesc(), ATTR_NAME_STREAM_LABEL, temp_stream);
  637. ClusterPtr new_cluster;
  638. // data like node without input should be handle specific
  639. if (HasNoInput(node) && IsDataLike(node)) {
  640. ClusterPtr cluster = MakeShared<Cluster>(temp_index, kEngineDefaultData, temp_stream);
  641. new_cluster = cluster;
  642. } else {
  643. if (node_engine_map->count(node) == 0) {
  644. REPORT_INNER_ERROR("E19999", "node:%s not find in node_engine_map", node->GetName().c_str());
  645. GELOGE(FAILED, "[Check][Param] node[%s] does not owner engine!", node->GetName().c_str());
  646. return FAILED;
  647. }
  648. ClusterPtr cluster = MakeShared<Cluster>(temp_index, node_engine_map->at(node), temp_stream);
  649. new_cluster = cluster;
  650. }
  651. if (new_cluster == nullptr) {
  652. REPORT_CALL_ERROR("E19999", "Allocate Cluster failed, index:%zu", temp_index);
  653. GELOGE(FAILED, "[Allocate][Cluster] failed, index:%zu", temp_index);
  654. return FAILED;
  655. }
  656. new_cluster->nodes_.push_back(node);
  657. if (!HasNoInput(node)) {
  658. auto node_id = node->GetOpDesc()->GetId();
  659. for (const auto &parent : node->GetInAllNodes()) {
  660. auto parent_id = parent->GetOpDesc()->GetId();
  661. if (parent_id < node_id) {
  662. auto iter = graph_info_.node_2_cluster_.find(parent);
  663. if (iter == graph_info_.node_2_cluster_.end()) {
  664. REPORT_INNER_ERROR("E19999", "node[%s]id[%ld]'s parent_node[%s]id[%ld] should make cluster in advance",
  665. node->GetOpDesc()->GetName().c_str(), node_id,
  666. parent->GetOpDesc()->GetName().c_str(), parent_id);
  667. GELOGE(FAILED, "[Check][Param] node[%s]id[%ld]'s parent_node[%s]id[%ld] should make cluster in advance",
  668. node->GetOpDesc()->GetName().c_str(), node_id, parent->GetOpDesc()->GetName().c_str(), parent_id);
  669. return FAILED;
  670. }
  671. new_cluster->in_clu_.insert(iter->second->index_);
  672. iter->second->out_clu_.insert(temp_index);
  673. }
  674. }
  675. }
  676. graph_info_.node_2_cluster_[node] = new_cluster;
  677. graph_info_.clusters_[temp_index] = new_cluster;
  678. GELOGD("Node name is %s, engine is %s, cluster index is %zu, stream label is %s", node->GetName().c_str(),
  679. new_cluster->engine_name_.c_str(), new_cluster->index_, new_cluster->stream_label_.c_str());
  680. temp_index++;
  681. }
  682. GELOGD("Initialize ends.");
  683. return SUCCESS;
  684. }
  685. Status ge::GraphPartitioner::AddPartitionsToGraphNode(vector<ge::SubGraphInfoPtr> &output_subgraphs,
  686. ge::ComputeGraphPtr compute_graph) {
  687. const std::string &input_subgraph_name = "inputNodesSubGraph";
  688. string session_graph_id;
  689. if (!AttrUtils::GetStr(*compute_graph, ATTR_NAME_SESSION_GRAPH_ID, session_graph_id)) {
  690. GELOGW("Get graph session_graph_id attr failed.");
  691. return INTERNAL_ERROR;
  692. }
  693. // the output_subgraphs have topological order
  694. for (const auto &sub_graph : graph_info_.rank_2_partitions_) {
  695. if (graph_info_.partitions_.find(sub_graph) == graph_info_.partitions_.end()) {
  696. REPORT_INNER_ERROR("E19999", "partition is null, subgraph:%s", sub_graph->GetName().c_str());
  697. GELOGE(GE_GRAPH_EMPTY_PARTITION, "[Check][Param] partition is null, subgraph:%s", sub_graph->GetName().c_str());
  698. return FAILED;
  699. }
  700. auto &engine_name = graph_info_.partitions_.at(sub_graph);
  701. (void)AttrUtils::SetStr(sub_graph, ATTR_NAME_PARENT_GRAPH_NAME, compute_graph->GetName());
  702. (void)sub_graph->SetExtAttr("part_src_graph", compute_graph);
  703. GELOGD("set attr success. subgraph(%s) with parent graph(%s)", sub_graph->GetName().c_str(),
  704. compute_graph->GetName().c_str());
  705. GE_DUMP(sub_graph, sub_graph->GetName() + "_" + mode_2_str_[graph_info_.mode_]);
  706. if (!session_graph_id.empty()) {
  707. GE_IF_BOOL_EXEC(!AttrUtils::SetStr(sub_graph, ATTR_NAME_SESSION_GRAPH_ID, session_graph_id),
  708. GELOGW("SetStr ATTR_NAME_SESSION_GRAPH_ID failed");)
  709. }
  710. // flush parent node of subgraph
  711. sub_graph->SetParentNode(compute_graph->GetParentNode());
  712. auto sgi = MakeShared<SubGraphInfo>();
  713. if (sgi == nullptr) {
  714. REPORT_CALL_ERROR("E19999", "allocate memory for SubGraphInfo failed.");
  715. GELOGE(GE_GRAPH_PARAM_NULLPTR, "[Allocate][Memory] for SubGraphInfo failed.");
  716. return FAILED;
  717. }
  718. // set engine name
  719. sgi->SetEngineName(engine_name);
  720. // set stream label
  721. string sub_graph_stream;
  722. if (AttrUtils::GetStr(sub_graph->GetDirectNode().at(0)->GetOpDesc(), ATTR_NAME_STREAM_LABEL, sub_graph_stream)) {
  723. sgi->SetStreamLabel(sub_graph_stream);
  724. }
  725. /// for now inputFlag is the same before and after partition. It should
  726. /// be changed according to the real partition
  727. std::vector<bool> sub_graph_input(graph_info_.input_size_, true);
  728. std::vector<bool> sub_graph_output(graph_info_.output_size_, true);
  729. sgi->SetSubGraph(sub_graph);
  730. sgi->SetOutputFlag(sub_graph_output);
  731. sgi->SetInputFlag(sub_graph_input);
  732. sgi->SetOutputContext(graph_info_.output_name_);
  733. AddEndPldInformationToSubGraphInfo(sgi);
  734. GELOGI("[GraphPartitioner]: subGraph engine name is %s, graph name is %s, stream label is %s", engine_name.c_str(),
  735. sub_graph->GetName().c_str(), sgi->GetStreamLabel().empty() ? "null" : sgi->GetStreamLabel().c_str());
  736. if (engine_name != input_subgraph_name) { // do not add Data subGraph into SubGraphInfo
  737. output_subgraphs.push_back(sgi);
  738. } else {
  739. graph_2_input_subgraph_[compute_graph] = sgi;
  740. }
  741. }
  742. return SUCCESS;
  743. }
  744. // check if two clusters can merge
  745. bool ge::GraphPartitioner::IsMergeable(size_t parent_cluster, size_t child_cluster, size_t upper_bound) {
  746. if ((graph_info_.clusters_[parent_cluster] == nullptr) || (graph_info_.clusters_[parent_cluster]->nodes_.empty()) ||
  747. (graph_info_.clusters_[child_cluster] == nullptr) || (graph_info_.clusters_[child_cluster]->nodes_.empty())) {
  748. return false;
  749. }
  750. // Check if parent_cluster,child_cluster has same engine or stream label
  751. if ((graph_info_.clusters_[parent_cluster]->engine_name_ != graph_info_.clusters_[child_cluster]->engine_name_) ||
  752. (graph_info_.clusters_[parent_cluster]->stream_label_ != graph_info_.clusters_[child_cluster]->stream_label_)) {
  753. GELOGD("Parent cluster %zu engine %s stream label %s, child cluster %zu engine %s stream label %s can not merge",
  754. parent_cluster, graph_info_.clusters_[parent_cluster]->engine_name_.c_str(),
  755. graph_info_.clusters_[parent_cluster]->stream_label_.c_str(), child_cluster,
  756. graph_info_.clusters_[child_cluster]->engine_name_.c_str(),
  757. graph_info_.clusters_[child_cluster]->stream_label_.c_str());
  758. return false;
  759. }
  760. // Check if parent_cluster,child_cluster is reachable
  761. RemoveEdge(parent_cluster, child_cluster);
  762. // Check if there is a path between parent and child, if return true, can not merge
  763. if (HasSecondPath(parent_cluster, child_cluster, upper_bound)) {
  764. GELOGD("Find second path from %zu to %zu, upper bound is %zu", parent_cluster, child_cluster, upper_bound);
  765. InsertEdge(parent_cluster, child_cluster);
  766. return false;
  767. }
  768. InsertEdge(parent_cluster, child_cluster);
  769. return true;
  770. }
  771. void ge::GraphPartitioner::MergeTwoClusters(size_t parent_cluster, size_t &child_cluster) {
  772. // check which index is bigger
  773. size_t big_cluster, small_cluster;
  774. size_t child_cluster_original = child_cluster;
  775. if (parent_cluster > child_cluster) {
  776. small_cluster = child_cluster;
  777. big_cluster = parent_cluster;
  778. } else {
  779. big_cluster = child_cluster;
  780. small_cluster = parent_cluster;
  781. // flush child_cluster, because it has been modified
  782. child_cluster = small_cluster;
  783. }
  784. // update node_2_cluster_ map
  785. for (auto &node : graph_info_.clusters_[big_cluster]->nodes_) {
  786. graph_info_.node_2_cluster_[node] = graph_info_.clusters_[small_cluster];
  787. }
  788. // merge nodes
  789. graph_info_.clusters_[small_cluster]->nodes_.splice(graph_info_.clusters_[small_cluster]->nodes_.end(),
  790. graph_info_.clusters_[big_cluster]->nodes_);
  791. // merge all input & output to small cluster
  792. graph_info_.clusters_[small_cluster]->in_clu_.insert(graph_info_.clusters_[big_cluster]->in_clu_.begin(),
  793. graph_info_.clusters_[big_cluster]->in_clu_.end());
  794. graph_info_.clusters_[small_cluster]->out_clu_.insert(graph_info_.clusters_[big_cluster]->out_clu_.begin(),
  795. graph_info_.clusters_[big_cluster]->out_clu_.end());
  796. // remove child_cluster's out parent_cluster's in between child_cluster and parent_cluster
  797. RemoveEdge(parent_cluster, child_cluster_original);
  798. // update in/out of the cluster with bigger index
  799. for (auto in_clu : graph_info_.clusters_[big_cluster]->in_clu_) {
  800. graph_info_.clusters_[in_clu]->out_clu_.insert(small_cluster);
  801. graph_info_.clusters_[in_clu]->out_clu_.erase(big_cluster);
  802. }
  803. for (auto out_clu : graph_info_.clusters_[big_cluster]->out_clu_) {
  804. graph_info_.clusters_[out_clu]->in_clu_.insert(small_cluster);
  805. graph_info_.clusters_[out_clu]->in_clu_.erase(big_cluster);
  806. }
  807. graph_info_.clusters_[big_cluster] = graph_info_.clusters_[small_cluster];
  808. }
  809. void ge::GraphPartitioner::RemoveEdge(size_t parent_cluster, size_t child_cluster) {
  810. graph_info_.clusters_[child_cluster]->in_clu_.erase(parent_cluster);
  811. graph_info_.clusters_[parent_cluster]->out_clu_.erase(child_cluster);
  812. }
  813. void ge::GraphPartitioner::InsertEdge(size_t from, size_t to) {
  814. if (from == to) {
  815. return;
  816. }
  817. if (!graph_info_.clusters_[from]->out_clu_.insert(to).second) {
  818. // edge has already exists
  819. return;
  820. }
  821. graph_info_.clusters_[to]->in_clu_.insert(from);
  822. }
  823. void ge::GraphPartitioner::MarkClusters() {
  824. GELOGI("MarkClusters starts. cluster size is %zu", graph_info_.clusters_.size());
  825. size_t cluster_size = graph_info_.clusters_.size();
  826. for (size_t child_cluster = 0; child_cluster < cluster_size; child_cluster++) {
  827. auto found_child_cluster = graph_info_.clusters_[child_cluster];
  828. if (found_child_cluster == nullptr) {
  829. GELOGW("can not found child_cluster is %zu", child_cluster);
  830. continue;
  831. }
  832. auto copy_parents_clusters = found_child_cluster->in_clu_;
  833. vector<size_t> ordered_cluster;
  834. for (const auto &parent_cluster : copy_parents_clusters) {
  835. ordered_cluster.emplace_back(parent_cluster);
  836. }
  837. // sort cluster according to it's output amount
  838. auto comp_func = [this](const size_t &parent_cluster1, const size_t &parent_cluster2) -> bool {
  839. return graph_info_.clusters_[parent_cluster1]->out_clu_.size() <
  840. graph_info_.clusters_[parent_cluster2]->out_clu_.size();
  841. };
  842. std::sort(ordered_cluster.begin(), ordered_cluster.end(), comp_func);
  843. auto child_merged = child_cluster;
  844. for (const auto &parent_cluster : ordered_cluster) {
  845. if (IsMergeable(parent_cluster, child_merged, child_cluster)) {
  846. MergeTwoClusters(parent_cluster, child_merged);
  847. GELOGD("Merging cluster %zu and %zu to %zu", parent_cluster, child_cluster, child_merged);
  848. }
  849. }
  850. }
  851. GELOGD("MarkClusters ends.");
  852. }
  853. Status ge::GraphPartitioner::SplitSubGraphs(ge::ComputeGraphPtr compute_graph) {
  854. GELOGD("SplitSubGraphs starts.");
  855. if (compute_graph == nullptr) {
  856. REPORT_INNER_ERROR("E19999", "Param compute_graph is nullptr, check invalid");
  857. GELOGE(FAILED, "[Check][Param] parameter ptr is null.");
  858. return FAILED;
  859. }
  860. // Create graphs for all clusters
  861. std::unordered_set<ClusterPtr> cluster_set;
  862. // add pld&end
  863. for (auto &node : compute_graph->GetDirectNode()) {
  864. GELOGD("Node name is %s.", node->GetName().c_str());
  865. auto child_cluster = graph_info_.node_2_cluster_[node];
  866. ge::ComputeGraphPtr corresponding_graph;
  867. // unordered_set's insert returns a pair, second of pair is bool
  868. if (!cluster_set.insert(child_cluster).second) {
  869. GELOGD("Old sub graph, child_cluster is %zu", child_cluster->index_);
  870. corresponding_graph = graph_info_.cluster_2_partition_.at(child_cluster);
  871. } else {
  872. std::string graph_name = "new_sub_graph" + std::to_string(graph_info_.partitions_.size());
  873. ComputeGraphPtr new_sub_graph = MakeShared<ge::ComputeGraph>(graph_name);
  874. if (new_sub_graph == nullptr) {
  875. REPORT_CALL_ERROR("E19999", "allocate memory for ge::ComputeGraph failed.");
  876. GELOGE(GE_GRAPH_PARAM_NULLPTR, "[Allocate][Memory] for ge::ComputeGraph failed.");
  877. return FAILED;
  878. }
  879. AddNewGraphToPartition(new_sub_graph, child_cluster->engine_name_);
  880. corresponding_graph = new_sub_graph;
  881. graph_info_.cluster_2_partition_[child_cluster] = corresponding_graph;
  882. GELOGD("New sub graph, name is %s", graph_name.c_str());
  883. }
  884. // build node to corresponding node map
  885. NodePtr corresponding_node = corresponding_graph->AddNode(node->GetOpDesc());
  886. if (corresponding_node == nullptr) {
  887. REPORT_CALL_ERROR("E19999", "add node:%s in graph:%s failed",
  888. node->GetName().c_str(), corresponding_graph->GetName().c_str());
  889. GELOGE(GE_GRAPH_PARAM_NULLPTR, "[Add][Node] %s in graph:%s failed.",
  890. node->GetName().c_str(), corresponding_graph->GetName().c_str());
  891. return FAILED;
  892. }
  893. graph_info_.corresponding_node_in_partitions_[node] = corresponding_node;
  894. GE_CHK_STATUS_RET(corresponding_node->SetOwnerComputeGraph(corresponding_graph))
  895. for (const auto &in_anchor : node->GetAllInAnchors()) {
  896. GELOGD("In anchor index is %d", AnchorUtils::GetIdx(in_anchor));
  897. for (auto &peer_out_anchor : in_anchor->GetPeerAnchors()) {
  898. GELOGD("Peer out anchor index is %d", AnchorUtils::GetIdx(peer_out_anchor));
  899. // Normally, all nodes have a copy in corresponding_node_in_partitions_, so function at can not be exception
  900. auto iter = graph_info_.corresponding_node_in_partitions_.find(peer_out_anchor->GetOwnerNode());
  901. if (iter == graph_info_.corresponding_node_in_partitions_.end()) {
  902. REPORT_INNER_ERROR("E19999", "node[%s]id[%ld]'s parent_node[%s]id[%ld]"
  903. "should make corresponding in advance",
  904. node->GetOpDesc()->GetName().c_str(), node->GetOpDesc()->GetId(),
  905. peer_out_anchor->GetOwnerNode()->GetOpDesc()->GetName().c_str(),
  906. peer_out_anchor->GetOwnerNode()->GetOpDesc()->GetId());
  907. GELOGE(GRAPH_FAILED, "[Check][Param] node[%s]id[%ld]'s parent_node[%s]id[%ld]"
  908. "should make corresponding in advance",
  909. node->GetOpDesc()->GetName().c_str(), node->GetOpDesc()->GetId(),
  910. peer_out_anchor->GetOwnerNode()->GetOpDesc()->GetName().c_str(),
  911. peer_out_anchor->GetOwnerNode()->GetOpDesc()->GetId());
  912. return GRAPH_FAILED;
  913. }
  914. auto parent_node = iter->second;
  915. GE_CHECK_NOTNULL(parent_node);
  916. GELOGD("Parent node name is %s", parent_node->GetName().c_str());
  917. // add edge
  918. auto src_anchor = parent_node->GetOutAnchor(AnchorUtils::GetIdx(peer_out_anchor));
  919. auto dst_anchor = corresponding_node->GetInAnchor(AnchorUtils::GetIdx(in_anchor));
  920. // if child and parent's cluster is not same, add plc and end
  921. auto parent_cluster = graph_info_.node_2_cluster_[peer_out_anchor->GetOwnerNode()];
  922. if (parent_cluster != child_cluster) {
  923. GELOGD("Parent cluster is %zu, child_cluster is %zu", parent_cluster->index_, child_cluster->index_);
  924. if (AddPlaceHolderEnd(peer_out_anchor, in_anchor) != ge::SUCCESS) {
  925. GELOGE(GE_GRAPH_ADD_PLC_END_FAILED,
  926. "[Call][AddPlaceHolderEnd] failed, out_anchor:%s index:%d, in_anchor:%s index:%d.",
  927. peer_out_anchor->GetOwnerNode()->GetName().c_str(), AnchorUtils::GetIdx(peer_out_anchor),
  928. in_anchor->GetOwnerNode()->GetName().c_str(), AnchorUtils::GetIdx(in_anchor));
  929. return FAILED;
  930. }
  931. } else { // parent and child in the same cluster, add edge
  932. GELOGD("AddEdge from parent cluster %zu to child %zu", parent_cluster->index_, child_cluster->index_);
  933. if (GraphUtils::AddEdge(src_anchor, dst_anchor) != GRAPH_SUCCESS) {
  934. REPORT_CALL_ERROR("E19999", "add edge from %s to %s failed",
  935. peer_out_anchor->GetOwnerNode()->GetName().c_str(),
  936. in_anchor->GetOwnerNode()->GetName().c_str());
  937. GELOGE(GRAPH_FAILED, "[Add][Edge] from %s to %s failed", peer_out_anchor->GetOwnerNode()->GetName().c_str(),
  938. in_anchor->GetOwnerNode()->GetName().c_str());
  939. return FAILED;
  940. }
  941. }
  942. }
  943. }
  944. }
  945. GELOGD("SplitSubGraphs ends.");
  946. return SUCCESS;
  947. }
  948. /// before calling this function, the direct path between src and dst are already removed.
  949. /// return true if a second path is found
  950. bool ge::GraphPartitioner::HasSecondPath(size_t src, size_t dst, size_t upper_bound) {
  951. if (graph_info_.clusters_.at(src)->out_clu_.empty() || graph_info_.clusters_.at(dst)->in_clu_.empty()) {
  952. return false;
  953. }
  954. /// Avoid recursion since stack space might be limited.
  955. /// We instead keep a stack of nodes to visit.
  956. std::vector<size_t> temp_stack;
  957. std::set<size_t> visited;
  958. temp_stack.push_back(src);
  959. while (!temp_stack.empty()) {
  960. size_t cluster = temp_stack.back();
  961. temp_stack.pop_back();
  962. ClusterPtr cur_cluster = graph_info_.clusters_[cluster];
  963. if (!visited.insert(cluster).second) {
  964. continue;
  965. }
  966. for (auto out : cur_cluster->out_clu_) {
  967. if (out == dst) {
  968. return true; // There is cycle
  969. }
  970. if (out < upper_bound) {
  971. temp_stack.push_back(out);
  972. }
  973. }
  974. }
  975. return false;
  976. }
  977. Status ge::GraphPartitioner::Partition(ge::ComputeGraphPtr compute_graph, Mode mode) {
  978. ClearAllPartitionData();
  979. auto real_ret = SUCCESS;
  980. auto ret = PartitionSubGraph(compute_graph, mode);
  981. if (ret != SUCCESS) {
  982. GELOGE(ret, "[Partition][SubGraph] Failed, ret:%d", ret);
  983. real_ret = ret;
  984. }
  985. GE_CHECK_NOTNULL(compute_graph);
  986. // partition sub graph
  987. for (const auto &sub_graph : compute_graph->GetAllSubgraphs()) {
  988. ret = PartitionSubGraph(sub_graph, mode);
  989. if (ret != SUCCESS) {
  990. GELOGE(ret, "[Partition][SubGraph] Failed, ret:%d", ret);
  991. real_ret = ret;
  992. }
  993. }
  994. if (real_ret != SUCCESS) {
  995. auto root_graph = ge::GraphUtils::FindRootGraph(compute_graph);
  996. GE_CHECK_NOTNULL(root_graph);
  997. (void)Analyzer::GetInstance()->SaveAnalyzerDataToFile(root_graph->GetSessionID(),
  998. root_graph->GetGraphID());
  999. }
  1000. return real_ret;
  1001. }
  1002. Status ge::GraphPartitioner::PartitionSubGraph(ge::ComputeGraphPtr compute_graph, Mode mode) {
  1003. if (compute_graph == nullptr) {
  1004. REPORT_INNER_ERROR("E19999", "Param compute_graph is nullptr, check invalid.");
  1005. GELOGE(GE_GRAPH_NULL_INPUT, "[Check][Param] compute_graph is nullptr.");
  1006. return FAILED;
  1007. }
  1008. // clear graph_info
  1009. graph_info_.ClearAllData(mode);
  1010. graph_info_.output_name_ = compute_graph->GetOutput();
  1011. graph_info_.output_size_ = compute_graph->GetOutputSize();
  1012. graph_info_.input_size_ = compute_graph->GetInputSize();
  1013. if (graph_info_.output_size_ == 0) {
  1014. REPORT_INNER_ERROR("E19999", "the output size of graph:%s is 0, check invalid.",
  1015. compute_graph->GetName().c_str());
  1016. GELOGE(GE_GRAPH_NULL_INPUT, "[Check][Param] The output size:0 of graph:%s need to be greater than 0.",
  1017. compute_graph->GetName().c_str());
  1018. return FAILED;
  1019. }
  1020. GELOGI("Graph Partition starts, graph nodes size is %zu", compute_graph->GetDirectNodesSize());
  1021. Status ret = compute_graph->TopologicalSorting();
  1022. if (ret != SUCCESS) {
  1023. REPORT_CALL_ERROR("E19999", "TopologicalSorting for graph:%s failed",
  1024. compute_graph->GetName().c_str());
  1025. GELOGE(GE_GRAPH_TOPO_SORT_FAILED, "[Call][TopologicalSorting] for subGraph:%s failed",
  1026. compute_graph->GetName().c_str());
  1027. return FAILED;
  1028. }
  1029. GE_TIMESTAMP_START(PartitionSubGraphInitialize);
  1030. if (Initialize(compute_graph) != SUCCESS) {
  1031. GELOGE(GE_GRAPH_INIT_FAILED, "[Call][Initialize] for graph:%s failed", compute_graph->GetName().c_str());
  1032. return FAILED;
  1033. }
  1034. GE_TIMESTAMP_END(PartitionSubGraphInitialize, "GraphPartitioner::PartitionInitialize");
  1035. GE_TIMESTAMP_START(PartitionSubGraphMarkClusters);
  1036. MarkClusters();
  1037. GE_TIMESTAMP_END(PartitionSubGraphMarkClusters, "GraphPartitioner::PartitionMarkClusters");
  1038. GE_TIMESTAMP_START(PartitionSubGraphSplitSubGraphs);
  1039. if (SplitSubGraphs(compute_graph) != SUCCESS) {
  1040. GELOGE(FAILED, "[Split][SubGraphs] for graph:%s failed", compute_graph->GetName().c_str());
  1041. return FAILED;
  1042. }
  1043. GE_TIMESTAMP_END(PartitionSubGraphSplitSubGraphs, "GraphPartitioner::PartitionSplitSubGraphs");
  1044. GE_TIMESTAMP_START(PartitionSubGraphSortSubGraphs);
  1045. if (SortSubGraphs(compute_graph) != ge::SUCCESS) {
  1046. GELOGE(GE_GRAPH_TOPO_SORT_FAILED, "[Sort][SubGraphs] for graph:%s failed.",
  1047. compute_graph->GetName().c_str());
  1048. return ge::FAILED;
  1049. }
  1050. GE_TIMESTAMP_END(PartitionSubGraphSortSubGraphs, "GraphPartitioner::PartitionSortSubGraphs");
  1051. GE_TIMESTAMP_START(PartitionSubGraphAddPartitionsToGraphNode);
  1052. vector<ge::SubGraphInfoPtr> output_subgraphs;
  1053. if (AddPartitionsToGraphNode(output_subgraphs, compute_graph) != ge::SUCCESS) {
  1054. GELOGE(GE_GRAPH_EMPTY_PARTITION, "[Add][Partitions] To GraphNode failed, graph:%s.",
  1055. compute_graph->GetName().c_str());
  1056. return ge::FAILED;
  1057. }
  1058. GE_TIMESTAMP_END(PartitionSubGraphAddPartitionsToGraphNode, "GraphPartitioner::PartitionAddPartitionsToGraphNode");
  1059. GELOGI("Graph Partition ends. Adding partitions to SubGraphInfo, got %zu sub graphs", output_subgraphs.size());
  1060. graph_info_.mode_ = kMerging;
  1061. // do not care over flow
  1062. partition_times_++;
  1063. graph_2_graph_partition_info_[compute_graph] = graph_info_;
  1064. graph_2_subgraph_list_[compute_graph] = output_subgraphs;
  1065. return SUCCESS;
  1066. }
  1067. // all the inputs are the nodes and anchors in the original graph
  1068. Status ge::GraphPartitioner::AddPlaceHolderEnd(const AnchorPtr &out_anchor, const AnchorPtr &in_anchor) {
  1069. if ((out_anchor == nullptr) || (in_anchor == nullptr)) {
  1070. REPORT_INNER_ERROR("E19999", "Param out_anchor or in_anchor is nullptr, check invalid.");
  1071. GELOGE(GE_GRAPH_PARAM_NULLPTR, "[Check][Param] out_anchor or in_anchor is nullptr.");
  1072. return FAILED;
  1073. }
  1074. // nodes in original graph
  1075. const auto &src_node = out_anchor->GetOwnerNode();
  1076. const auto &dst_node = in_anchor->GetOwnerNode();
  1077. if ((src_node == nullptr) || (dst_node == nullptr)) {
  1078. REPORT_INNER_ERROR("E19999", "in_anchor'node or out_anchor'node is nullptr. check invalid.");
  1079. GELOGE(GE_GRAPH_PARAM_NULLPTR, "[Check][Param] src_node or dst_node is nullptr.");
  1080. return FAILED;
  1081. }
  1082. // All nodes have a copy in corresponding_node_in_partitions_, so function at can not be execption
  1083. auto src_anchor =
  1084. graph_info_.corresponding_node_in_partitions_.at(src_node)->GetOutAnchor(AnchorUtils::GetIdx(out_anchor));
  1085. auto dst_anchor =
  1086. graph_info_.corresponding_node_in_partitions_.at(dst_node)->GetInAnchor(AnchorUtils::GetIdx(in_anchor));
  1087. if ((src_anchor == nullptr) || (dst_anchor == nullptr)) {
  1088. REPORT_INNER_ERROR("E19999", "src_anchor(index:%d) or dst_anchor(index:%d) is nullptr.",
  1089. AnchorUtils::GetIdx(out_anchor), AnchorUtils::GetIdx(in_anchor));
  1090. GELOGE(GE_GRAPH_PARAM_NULLPTR, "[Check][Param] src_anchor(index:%d) or dst_anchor(index:%d) is nullptr.",
  1091. AnchorUtils::GetIdx(out_anchor), AnchorUtils::GetIdx(in_anchor));
  1092. return FAILED;
  1093. }
  1094. // anchors in subGraph
  1095. const ComputeGraphPtr &src_subgraph = src_anchor->GetOwnerNode()->GetOwnerComputeGraph();
  1096. const ComputeGraphPtr &dst_subgraph = dst_anchor->GetOwnerNode()->GetOwnerComputeGraph();
  1097. // add end and pld node
  1098. auto ret = AddPlaceHolderEndInSrcDstGraph(src_anchor, dst_anchor, dst_subgraph, src_subgraph);
  1099. if (ret != SUCCESS) {
  1100. GELOGE(GE_GRAPH_ADD_PLC_END_FAILED, "[Call][AddPlaceHolderEndInSrcDstGraph] failed, ret:%d.", ret);
  1101. return ret;
  1102. }
  1103. return SUCCESS;
  1104. }
  1105. Status ge::GraphPartitioner::SortSubGraphs(const ge::ComputeGraphPtr &compute_graph) {
  1106. uint32_t rank = kRankOne; // rank 0 for data graph
  1107. ComputeGraphPtr new_input_nodes_sub_graph = MakeShared<ComputeGraph>("inputNodeGraph");
  1108. if ((new_input_nodes_sub_graph == nullptr) || (compute_graph == nullptr)) {
  1109. REPORT_INNER_ERROR("E19999", "Param compute_graph is nullptr or Allocate Memory for ComputeGraph failed.");
  1110. GELOGE(FAILED, "[Check][Param] new_input_nodes_sub_graph or compute_graph is nullptr.");
  1111. return FAILED;
  1112. }
  1113. for (const auto &node : compute_graph->GetDirectNode()) {
  1114. // All nodes in original graph have a copy in corresponding_node_in_partitions_, so it can not be null
  1115. auto sub_graph = graph_info_.corresponding_node_in_partitions_.at(node)->GetOwnerComputeGraph();
  1116. if ((graph_info_.partitions_2_rank_.find(sub_graph) == graph_info_.partitions_2_rank_.end()) &&
  1117. (graph_info_.partitions_[sub_graph] != kEngineDefaultData)) {
  1118. graph_info_.partitions_2_rank_[sub_graph] = rank;
  1119. graph_info_.rank_2_partitions_.push_back(sub_graph);
  1120. rank++;
  1121. } else if (graph_info_.partitions_[sub_graph] == kEngineDefaultData) { // merge data graph
  1122. if (PutInputNodesInSubGraph(sub_graph, new_input_nodes_sub_graph) != SUCCESS) {
  1123. GELOGE(FAILED, "[Call][putInputNodesInSubGraph] failed.");
  1124. return FAILED;
  1125. }
  1126. auto to_be_del = graph_info_.partitions_.find(sub_graph);
  1127. graph_info_.partitions_.erase(to_be_del);
  1128. }
  1129. }
  1130. if (!new_input_nodes_sub_graph->GetDirectNode().empty()) {
  1131. graph_info_.rank_2_partitions_.insert(graph_info_.rank_2_partitions_.begin(), new_input_nodes_sub_graph);
  1132. graph_info_.partitions_2_rank_[new_input_nodes_sub_graph] = 0;
  1133. AddNewGraphToPartition(new_input_nodes_sub_graph, "inputNodesSubGraph");
  1134. }
  1135. // reinit rank
  1136. rank = kRankZero;
  1137. for (const auto &it : graph_info_.rank_2_partitions_) {
  1138. // rename subGraph based on rank
  1139. if (it != nullptr) {
  1140. // rename subGraph based on rank
  1141. string graph_name =
  1142. "partition" + std::to_string(partition_times_) + "_rank" + std::to_string(rank) + "_" + it->GetName();
  1143. it->SetName(graph_name);
  1144. }
  1145. rank++;
  1146. }
  1147. return SUCCESS;
  1148. }
  1149. AnchorPtr ge::GraphPartitioner::GetEndInAnchor(const AnchorPtr &src_anchor, const NodePtr &end_node) {
  1150. if ((src_anchor == nullptr) || (end_node == nullptr)) {
  1151. REPORT_INNER_ERROR("E19999", "Param src_anchor or end_node is nullptr, check invalid.");
  1152. GELOGE(FAILED, "[Check][Param] parameter src_anchor or end_node is nullptr.");
  1153. return nullptr;
  1154. }
  1155. AnchorPtr end_in_anchor;
  1156. if (Anchor::DynamicAnchorCast<OutDataAnchor>(src_anchor) != nullptr) {
  1157. end_in_anchor = end_node->GetInDataAnchor(0);
  1158. } else {
  1159. end_in_anchor = end_node->GetInControlAnchor();
  1160. }
  1161. return end_in_anchor;
  1162. }
  1163. AnchorPtr ge::GraphPartitioner::GetPldOutAnchor(const NodePtr &pld_node, const AnchorPtr &dst_anchor) {
  1164. if ((pld_node == nullptr) || (dst_anchor == nullptr)) {
  1165. REPORT_INNER_ERROR("E19999", "Param pld_node or dst_anchor is nullptr, check invalid.");
  1166. GELOGE(FAILED, "[Check][Param] parameter pld_node or dst_anchor is nullptr.");
  1167. return nullptr;
  1168. }
  1169. AnchorPtr pld_out_anchor;
  1170. if (Anchor::DynamicAnchorCast<InDataAnchor>(dst_anchor) != nullptr) {
  1171. pld_out_anchor = pld_node->GetOutDataAnchor(0);
  1172. } else {
  1173. pld_out_anchor = pld_node->GetOutControlAnchor();
  1174. }
  1175. return pld_out_anchor;
  1176. }
  1177. void ge::GraphPartitioner::AddEndPldInformationToSubGraphInfo(ge::SubGraphInfoPtr &subgraph_info) {
  1178. if (subgraph_info == nullptr) {
  1179. GELOGE(FAILED, "[Check][Param] parameter subgraph_info is nullptr.");
  1180. return;
  1181. }
  1182. auto subgraph = subgraph_info->GetSubGraph();
  1183. GE_CHECK_NOTNULL_JUST_RETURN(subgraph);
  1184. NodetoNodeMap end_map;
  1185. NodetoNodeMap pld_map;
  1186. for (const auto &node : subgraph->GetDirectNode()) {
  1187. if (node->GetType() == kEndType) {
  1188. end_map[node] = graph_info_.end_2_pld_.at(node);
  1189. }
  1190. if (node->GetType() == kPlaceHolderType) {
  1191. pld_map[node] = graph_info_.pld_2_end_.at(node);
  1192. }
  1193. }
  1194. subgraph_info->SetEnd2PldMap(end_map);
  1195. subgraph_info->SetPld2EndMap(pld_map);
  1196. }
  1197. const Graph2SubGraphInfoList &ge::GraphPartitioner::GetSubGraphMap() { return graph_2_subgraph_list_; }
  1198. void ge::GraphPartitioner::ClearAllPartitionData() {
  1199. graph_2_graph_partition_info_.clear();
  1200. graph_2_subgraph_list_.clear();
  1201. graph_2_input_subgraph_.clear();
  1202. GELOGD("Clear all partition data success.");
  1203. return;
  1204. }
  1205. } // namespace ge

图引擎模块(GE)是 MindSpore 的一个子模块,其代码由 C++ 实现,位于前端模块 ME 和底层硬件之间,起到承接作用。图引擎模块以 ME 下发的图作为输入,然后进行一系列的深度图优化操作,最后输出一张可以在底层硬件上高效运行的图。GE 针对昇腾 AI 处理器的硬件结构特点,做了特定的优化工作,以此来充分发挥出昇腾 AI 处理器的强大算力。在进行模型训练/推理时,GE 会被自动调用,用户对此并无感知。GE 主要由 GE API 和 GE Core 两部分组成,详细的架构图如下所示: