You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

inner_session.cc 26 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
5 years ago
5 years ago
5 years ago
4 years ago
5 years ago
5 years ago
4 years ago
5 years ago
5 years ago
4 years ago
5 years ago
5 years ago
5 years ago
4 years ago
5 years ago
4 years ago
4 years ago
5 years ago
4 years ago
5 years ago
4 years ago
4 years ago
5 years ago
5 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
5 years ago
4 years ago
4 years ago
4 years ago
5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596
  1. /**
  2. * Copyright 2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "session/inner_session.h"
  17. #include <map>
  18. #include <memory>
  19. #include <vector>
  20. #include "analyzer/analyzer.h"
  21. #include "adx_datadump_server.h"
  22. #include "common/dump/dump_properties.h"
  23. #include "common/dump/dump_manager.h"
  24. #include "framework/common/util.h"
  25. #include "framework/common/debug/ge_log.h"
  26. #include "graph/ge_context.h"
  27. #include "graph/ge_global_options.h"
  28. #include "graph/ge_local_context.h"
  29. #include "common/local_context.h"
  30. #include "graph/manager/graph_var_manager.h"
  31. #include "graph/manager/graph_mem_manager.h"
  32. #include "graph/utils/tensor_adapter.h"
  33. #include "runtime/mem.h"
  34. #include "ir_build/option_utils.h"
  35. #include "common/profiling/profiling_manager.h"
  36. #include "common/profiling/profiling_init.h"
  37. namespace ge {
  38. namespace {
  39. const int32_t kDumpStatus = 0;
  40. Status CheckReuseMemoryOption(const std::map<string, string> &options) {
  41. auto iter = options.find(OPTION_EXEC_DISABLE_REUSED_MEMORY);
  42. if (iter != options.end()) {
  43. if (iter->second == "0") {
  44. GELOGD("%s=0, reuse memory is open", OPTION_EXEC_DISABLE_REUSED_MEMORY);
  45. } else if (iter->second == "1") {
  46. GELOGD("%s=1, reuse memory is close", OPTION_EXEC_DISABLE_REUSED_MEMORY);
  47. } else {
  48. GELOGE(PARAM_INVALID, "[CheckReuse][MemoryOption]option %s=%s is invalid",
  49. OPTION_EXEC_DISABLE_REUSED_MEMORY, iter->second.c_str());
  50. REPORT_INNER_ERROR("E19999", "CheckReuseMemoryOption failed because option %s=%s is invalid.",
  51. OPTION_EXEC_DISABLE_REUSED_MEMORY, iter->second.c_str());
  52. return FAILED;
  53. }
  54. }
  55. return SUCCESS;
  56. }
  57. }
  58. static std::mutex mutex_; // BuildGraph and RunGraph use
  59. bool InnerSession::is_dump_server_inited_ = false;
  60. InnerSession::InnerSession(uint64_t session_id, const std::map<string, string> &options)
  61. : init_flag_(false), session_id_(session_id), options_(options) {}
  62. Status InnerSession::Initialize() {
  63. if (init_flag_) {
  64. GELOGW("[InnerSession:%lu] session already initialize.", session_id_);
  65. return SUCCESS;
  66. }
  67. // If the global options and the session options are duplicated, the session options is preferred.
  68. auto all_options = options_;
  69. all_options.insert(GetMutableGlobalOptions().begin(), GetMutableGlobalOptions().end());
  70. Status ret = CheckReuseMemoryOption(all_options);
  71. if (ret != SUCCESS) {
  72. GELOGE(ret, "[CheckReuse][MemoryOption] failed, [InnerSession:%lu].", session_id_);
  73. REPORT_CALL_ERROR("E19999", "CheckReuseMemoryOption failed, InnerSession=%lu.", session_id_);
  74. return ret;
  75. }
  76. //Check option OP_PRECISION_MODE
  77. auto iter = all_options.find(ge::OP_PRECISION_MODE);
  78. if (iter != all_options.end() && !iter->second.empty() && !ge::CheckInputPathValid(iter->second)) {
  79. REPORT_INPUT_ERROR("E10001", std::vector<std::string>({"parameter", "value", "reason"}),
  80. std::vector<std::string>({ge::OP_PRECISION_MODE, iter->second, "path is not found"}));
  81. GELOGE(PARAM_INVALID, "[Check][OP_PRECISION_MODE] %s not found", iter->second.c_str());
  82. return FAILED;
  83. }
  84. if (iter != all_options.end()) {
  85. GELOGI("Option set successfully, option_key=%s, option_value=%s",
  86. ge::OP_PRECISION_MODE.c_str(), iter->second.c_str());
  87. }
  88. // Check option modify_mixlist
  89. if (ge::CheckModifyMixlistParamValid(all_options) != ge::SUCCESS) {
  90. return FAILED;
  91. }
  92. UpdateThreadContext(std::map<std::string, std::string>{});
  93. // session device id set here
  94. std::string str_session_device_id;
  95. if (GetContext().GetOption("ge.session_device_id", str_session_device_id) == SUCCESS) {
  96. GELOGI("Option session device id has set, value is %s.", str_session_device_id.c_str());
  97. uint32_t session_device_id = 0;
  98. try {
  99. session_device_id = static_cast<uint32_t>(std::stoi(str_session_device_id.c_str()));
  100. // session device id has priority
  101. GetContext().SetCtxDeviceId(session_device_id);
  102. } catch (std::invalid_argument &) {
  103. GELOGW("session device id %s transform to int failed.", str_session_device_id.c_str());
  104. } catch (std::out_of_range &) {
  105. GELOGW("session device id %s transform to int failed.", str_session_device_id.c_str());
  106. }
  107. }
  108. GE_CHK_RT_RET(rtSetDevice(GetContext().DeviceId()));
  109. DumpProperties dump_properties;
  110. GE_CHK_STATUS_RET(dump_properties.InitByOptions(), "Init dump properties failed.");
  111. GE_CHK_STATUS_RET(AddDumpProperties(dump_properties), "[Add][DumpProperties] failed.");
  112. ret = InnerInitialize();
  113. if (ret != SUCCESS) {
  114. GELOGE(ret, "[Init][GraphManager] failed, InnerSession:%lu.", session_id_);
  115. REPORT_CALL_ERROR("E19999", "GraphManager initialize failed, InnerSession:%lu.", session_id_);
  116. GE_CHK_STATUS(RemoveDumpProperties(), "[Remove][DumpProperties] failed.");
  117. return ret;
  118. }
  119. ret = VarManager::Instance(session_id_)->SetMemoryMallocSize(all_options);
  120. if (ret != SUCCESS) {
  121. GELOGE(ret, "[Set][MemoryMallocSize] failed.");
  122. REPORT_CALL_ERROR("E19999", "VarManager SetMemoryMallocSize failed, InnerSession:%lu.", session_id_);
  123. (void)InnerFinalize();
  124. GE_CHK_STATUS(RemoveDumpProperties(), "[Remove][DumpProperties] failed.");
  125. GE_CHK_RT(rtDeviceReset(static_cast<int32_t>(GetContext().DeviceId())));
  126. return ret;
  127. }
  128. int32_t version = static_cast<int32_t>(SessionVersion::ClOUD_VERSION);
  129. const int DEFAULT_DEVICE_ID = 0;
  130. const int DEFAULT_JOB_ID = 0;
  131. ret = VarManager::Instance(session_id_)->Init(version, session_id_, DEFAULT_DEVICE_ID, DEFAULT_JOB_ID);
  132. if (ret != SUCCESS) {
  133. GELOGE(ret, "[Init][VarManager] failed.");
  134. REPORT_CALL_ERROR("E19999", "VarManager init failed, InnerSession:%lu.", session_id_);
  135. GE_CHK_STATUS(RemoveDumpProperties(), "[Remove][DumpProperties] failed.");
  136. }
  137. init_flag_ = true;
  138. return SUCCESS;
  139. }
  140. Status InnerSession::Finalize() {
  141. std::lock_guard<std::mutex> lock(resource_mutex_);
  142. if (!init_flag_) {
  143. GELOGW("[InnerSession:%lu] session does not initialize.", session_id_);
  144. return SUCCESS;
  145. }
  146. UpdateThreadContext(std::map<std::string, std::string>{});
  147. Status ret = InnerFinalize();
  148. if (ret != SUCCESS) {
  149. // Subsequent code execution is required, so no return is required
  150. GELOGE(ret, "[Finalize][GraphManager] failed, InnerSession:%lu.", session_id_);
  151. REPORT_CALL_ERROR("E19999", "GraphManager Finalize failed, InnerSession:%lu.", session_id_);
  152. }
  153. init_flag_ = false;
  154. // release var memory
  155. GELOGI("VarManager free var memory.");
  156. (void)VarManager::Instance(session_id_)->FreeVarMemory();
  157. for (auto memory_type : MemManager::Instance().GetAllMemoryType()) {
  158. (void)MemManager::Instance().SessionScopeMemInstance(memory_type).Free(session_id_);
  159. }
  160. // release analyzer saved info(Session Level)
  161. Analyzer::GetInstance()->DestroySessionJsonObject(session_id_);
  162. GE_CHK_RT(rtDeviceReset(static_cast<int32_t>(GetContext().DeviceId())));
  163. GE_CHK_STATUS_RET(RemoveDumpProperties(), "[Remove][DumpProperties] failed.");
  164. return ret;
  165. }
  166. Status InnerSession::InnerInitialize() {
  167. Status ret = model_executor_.Initialize(options_, session_id_);
  168. if (ret != SUCCESS) {
  169. GELOGE(ret, "[Init][GraphExecutor] failed, InnerSession:%lu.", session_id_);
  170. REPORT_CALL_ERROR("E19999", "GraphExecutor initialize failed, InnerSession:%lu.", session_id_);
  171. GE_CHK_STATUS(RemoveDumpProperties(), "[Remove][DumpProperties] failed.");
  172. return ret;
  173. }
  174. ret = graph_manager_.Initialize(options_, &model_executor_);
  175. if (ret != SUCCESS) {
  176. GELOGE(ret, "[Init][GraphManager] failed, InnerSession:%lu.", session_id_);
  177. REPORT_CALL_ERROR("E19999", "GraphManager initialize failed, InnerSession:%lu.", session_id_);
  178. GE_CHK_STATUS(RemoveDumpProperties(), "[Remove][DumpProperties] failed.");
  179. return ret;
  180. }
  181. return SUCCESS;
  182. }
  183. Status InnerSession::InnerFinalize() {
  184. Status ret = graph_manager_.Finalize();
  185. if (ret != SUCCESS) {
  186. // Subsequent code execution is required, so no return is required
  187. GELOGE(ret, "[Finalize][GraphManager] failed, InnerSession:%lu.", session_id_);
  188. REPORT_CALL_ERROR("E19999", "GraphManager Finalize failed, InnerSession:%lu.", session_id_);
  189. }
  190. ret = model_executor_.Finalize();
  191. if (ret != SUCCESS) {
  192. // Subsequent code execution is required, so no return is required
  193. GELOGE(ret, "[Finalize][GraphExecutor] failed, InnerSession:%lu.", session_id_);
  194. REPORT_CALL_ERROR("E19999", "GraphExecutor Finalize failed, InnerSession:%lu.", session_id_);
  195. }
  196. return SUCCESS;
  197. }
  198. Status InnerSession::GetVariable(const std::string &name, Tensor &val) {
  199. UpdateThreadContext(std::map<std::string, std::string>{});
  200. return graph_manager_.GetVariable(name, val);
  201. }
  202. Status InnerSession::AddGraph(uint32_t graph_id, const Graph &graph) {
  203. std::map<std::string, std::string> options;
  204. auto device_id = GetContext().DeviceId();
  205. GELOGD("Device id is %u", device_id);
  206. ProfilingManager::Instance().SetGraphIdToDeviceMap(graph_id, device_id);
  207. return AddGraph(graph_id, graph, options);
  208. }
  209. Status InnerSession::AddGraph(uint32_t graph_id, const Graph &graph,
  210. const std::map<std::string, std::string> &options) {
  211. std::lock_guard<std::mutex> lock(resource_mutex_);
  212. if (!init_flag_) {
  213. GELOGE(GE_SESS_INIT_FAILED, "[Add][Graph] failed because GraphManager not init, InnerSession:%lu, graph_id:%u.",
  214. session_id_, graph_id);
  215. REPORT_INNER_ERROR("E19999", "AddGraph failed because GraphManager not init, InnerSession:%lu, graph_id:%u.",
  216. session_id_, graph_id);
  217. return GE_SESS_INIT_FAILED;
  218. }
  219. UpdateThreadContext(options);
  220. Status ret = graph_manager_.AddGraph(graph_id, graph, options, domi::GetContext());
  221. if (ret != SUCCESS) {
  222. GELOGE(ret, "[Add][Graph] failed, InnerSession:%lu graphid: %u.", session_id_, graph_id);
  223. REPORT_CALL_ERROR("E19999", "GraphManager AddGraph failed, InnerSession:%lu graphid: %u.", session_id_, graph_id);
  224. return ret;
  225. }
  226. GELOGI("[InnerSession:%lu] add graph success, graph_id=%u.", session_id_, graph_id);
  227. return SUCCESS;
  228. }
  229. Status InnerSession::AddGraphWithCopy(uint32_t graph_id, const Graph &graph,
  230. const std::map<std::string, std::string> &options) {
  231. std::lock_guard<std::mutex> lock(resource_mutex_);
  232. if (!init_flag_) {
  233. GELOGE(GE_SESS_INIT_FAILED, "[Add][Graph] failed because GraphManager not init, InnerSession:%lu, graph_id:%u.",
  234. session_id_, graph_id);
  235. REPORT_INNER_ERROR("E19999",
  236. "AddGraphWithCopy failed because GraphManager not init, InnerSession:%lu, graph_id:%u.",
  237. session_id_, graph_id);
  238. return GE_SESS_INIT_FAILED;
  239. }
  240. UpdateThreadContext(options);
  241. Status ret = graph_manager_.AddGraphWithCopy(graph_id, graph, options, domi::GetContext());
  242. if (ret != SUCCESS) {
  243. GELOGE(ret, "[Add][Graph] failed, InnerSession:%lu graphid: %u.", session_id_, graph_id);
  244. REPORT_CALL_ERROR("E19999",
  245. "GraphManager AddGraphWithCopy failed, InnerSession:%lu graphid: %u.", session_id_, graph_id);
  246. return ret;
  247. }
  248. GELOGI("[InnerSession:%lu] add graph success, graph_id=%u.", session_id_, graph_id);
  249. return SUCCESS;
  250. }
  251. Status InnerSession::RunGraph(uint32_t graph_id, const std::vector<Tensor> &inputs, std::vector<Tensor> &outputs) {
  252. GELOGI("[InnerSession:%lu] run graph on session, graph_id=%u.", session_id_, graph_id);
  253. if (mutex_.try_lock()) {
  254. std::lock_guard<std::mutex> lock(mutex_, std::adopt_lock);
  255. auto device_id = GetContext().DeviceId();
  256. GELOGD("device is is %u", device_id);
  257. ProfilingInit::Instance().SetDeviceIdByModelId(graph_id, device_id);
  258. if (!init_flag_) {
  259. GELOGE(GE_SESS_INIT_FAILED, "[Run][Graph]failed because GraphManager not Init, InnerSession:%lu, graph_id:%u.",
  260. session_id_, graph_id);
  261. REPORT_INNER_ERROR("E19999", "RunGraph failed because GraphManager not Init, InnerSession:%lu, graph_id:%u.",
  262. session_id_, graph_id);
  263. return GE_SESS_INIT_FAILED;
  264. }
  265. UpdateThreadContext(graph_id);
  266. vector<GeTensor> geInputs;
  267. for (auto &item : inputs) {
  268. geInputs.push_back(TensorAdapter::AsGeTensor(item));
  269. }
  270. vector<GeTensor> geOutputs;
  271. Status ret = graph_manager_.RunGraph(graph_id, geInputs, geOutputs, session_id_);
  272. domi::GetContext().out_nodes_map.clear();
  273. domi::GetContext().user_out_nodes.clear();
  274. if (ret != SUCCESS) {
  275. GELOGE(ret, "[Run][Graph]failed, InnerSession:%lu graph_id=%u.", session_id_, graph_id);
  276. REPORT_CALL_ERROR("E19999",
  277. "GraphManager RunGraph failed, InnerSession:%lu graph_id=%u.", session_id_, graph_id);
  278. return ret;
  279. }
  280. outputs.clear();
  281. for (auto &item : geOutputs) {
  282. outputs.push_back(TensorAdapter::AsTensor(item));
  283. }
  284. GELOGI("[InnerSession:%lu] run graph success, graph_id=%u.", session_id_, graph_id);
  285. return SUCCESS;
  286. } else {
  287. GELOGE(GE_SESS_ALREADY_RUNNING, "[Run][Graph]failed, InnerSession:%lu, graph_id=%u.", session_id_, graph_id);
  288. REPORT_INNER_ERROR("E19999",
  289. "RunGraph failed because mutex try_lock false, InnerSession:%lu, graph_id=%u.",
  290. session_id_, graph_id);
  291. return GE_SESS_ALREADY_RUNNING;
  292. }
  293. }
  294. Status InnerSession::RunGraphWithStreamAsync(uint32_t graph_id, rtStream_t stream,
  295. const std::vector<Tensor> &inputs, std::vector<Tensor> &outputs) {
  296. GELOGI("Run graph with stream, session id = %lu, graph id = %u, stream = %p.",
  297. session_id_, graph_id, stream);
  298. if (mutex_.try_lock()) {
  299. std::lock_guard<std::mutex> lock(mutex_, std::adopt_lock);
  300. if (!init_flag_) {
  301. GELOGE(GE_SESS_INIT_FAILED, "[Run][GraphWithStream]failed because GraphManager not Init,"
  302. "session id = %lu, graph id = %u, stream = %p.", session_id_, graph_id, stream);
  303. REPORT_INNER_ERROR("E19999", "RunGraphWithStreamAsync failed because GraphManager not Init,"
  304. "session id = %lu, graph id = %u, stream = %p.", session_id_, graph_id, stream);
  305. return GE_SESS_INIT_FAILED;
  306. }
  307. auto device_id = GetContext().DeviceId();
  308. GELOGD("device id is %u", device_id);
  309. ProfilingInit::Instance().SetDeviceIdByModelId(graph_id, device_id);
  310. UpdateThreadContext(graph_id);
  311. vector<GeTensor> ge_inputs;
  312. for (auto &item : inputs) {
  313. ge_inputs.emplace_back(TensorAdapter::AsGeTensor(item));
  314. }
  315. vector<GeTensor> ge_outputs;
  316. for (auto &item : outputs) {
  317. ge_outputs.emplace_back(TensorAdapter::AsGeTensor(item));
  318. }
  319. Status ret = graph_manager_.RunGraphWithStreamAsync(graph_id, stream, session_id_, ge_inputs, ge_outputs);
  320. domi::GetContext().out_nodes_map.clear();
  321. domi::GetContext().user_out_nodes.clear();
  322. if (ret != SUCCESS) {
  323. GELOGE(ret, "[Run][GraphWithStreamAsync]failed,"
  324. "session id = %lu, graph id = %u, stream = %p.", session_id_, graph_id, stream);
  325. REPORT_CALL_ERROR("E19999", "GraphManager RunGrapWithStreamhAsync failed,"
  326. "session id = %lu, graph id = %u, stream = %p.", session_id_, graph_id, stream);
  327. return ret;
  328. }
  329. GELOGI("Run graph with stream success, session id = %lu, graph id = %u, stream = %p.",
  330. session_id_, graph_id, stream);
  331. return SUCCESS;
  332. } else {
  333. GELOGE(GE_SESS_ALREADY_RUNNING, "[Run][GraphWithStreamAsync]failed because mutex try_lock false,"
  334. "session id = %lu, graph id = %u, stream = %p.", session_id_, graph_id, stream);
  335. REPORT_INNER_ERROR("E19999", "[Run][GraphWithStreamAsync]failed failed because mutex try_lock false,"
  336. "session id = %lu, graph id = %u, stream = %p.", session_id_, graph_id, stream);
  337. return GE_SESS_ALREADY_RUNNING;
  338. }
  339. }
  340. Status InnerSession::RemoveGraph(uint32_t graph_id) {
  341. std::lock_guard<std::mutex> lock(resource_mutex_);
  342. if (!init_flag_) {
  343. GELOGE(GE_SESS_INIT_FAILED,
  344. "[Remove][Graph] failed because GraphManager not init, InnerSession:%lu, graph_id=%u.",
  345. session_id_, graph_id);
  346. REPORT_INNER_ERROR("E19999",
  347. "RemoveGraph failed, because GraphManager not init, InnerSession:%lu, graph_id=%u.",
  348. session_id_, graph_id);
  349. return GE_SESS_INIT_FAILED;
  350. }
  351. auto device_id = GetContext().DeviceId();
  352. GELOGD("remove device id %u", device_id);
  353. ProfilingInit::Instance().UnsetDeviceIdByModelId(graph_id, device_id);
  354. UpdateThreadContext(graph_id);
  355. Status ret = graph_manager_.RemoveGraph(graph_id);
  356. if (ret != SUCCESS) {
  357. GELOGE(ret, "[Remove][Graph] failed, InnerSession:%lu, graph_id=%u.", session_id_, graph_id);
  358. REPORT_CALL_ERROR("E19999",
  359. "GraphManager RemoveGraph failed, InnerSession:%lu, graph_id=%u.", session_id_, graph_id);
  360. return ret;
  361. }
  362. GELOGI("[InnerSession:%lu] remove graph success, graph_id=%u.", session_id_, graph_id);
  363. return SUCCESS;
  364. }
  365. Status InnerSession::RegisterCallBackFunc(
  366. const std::string &key,
  367. const std::function<Status(uint32_t, const std::map<std::string, ge::Tensor> &)> &callback) {
  368. std::lock_guard<std::mutex> lock(resource_mutex_);
  369. if (!init_flag_) {
  370. GELOGE(GE_SESS_INIT_FAILED,
  371. "[Register][CallBackFunc] failed because GraphManager not initialize, InnerSession:%lu.", session_id_);
  372. REPORT_INNER_ERROR("E19999",
  373. "RegisterCallBackFunc failed because GraphManager not init, InnerSession:%lu.", session_id_);
  374. return GE_SESS_INIT_FAILED;
  375. }
  376. UpdateThreadContext(std::map<std::string, std::string>{});
  377. Status ret = graph_manager_.RegisterCallBackFunc(key, callback);
  378. if (ret != SUCCESS) {
  379. GELOGE(ret, "[Register][CallBackFunc] failed, InnerSession:%lu register %s.", session_id_, key.c_str());
  380. REPORT_CALL_ERROR("E19999",
  381. "GraphManager RegisterCallBackFunc failed, InnerSession:%lu register %s.",
  382. session_id_, key.c_str());
  383. return ret;
  384. }
  385. GELOGI("[InnerSession:%lu] register %s callback function success.", session_id_, key.c_str());
  386. return SUCCESS;
  387. }
  388. Status InnerSession::RegisterCallBackFunc(
  389. const std::string &key,
  390. const std::function<Status(uint32_t, const std::map<AscendString, ge::Tensor> &)> &callback) {
  391. std::lock_guard<std::mutex> lock(resource_mutex_);
  392. if (!init_flag_) {
  393. GELOGE(GE_SESS_INIT_FAILED,
  394. "[Register][CallBackFunc]failed because GraphManager not initialize, InnerSession:%lu.", session_id_);
  395. REPORT_INNER_ERROR("E19999",
  396. "RegisterCallBackFunc failed because GraphManager not initialize, InnerSession:%lu.",
  397. session_id_);
  398. return GE_SESS_INIT_FAILED;
  399. }
  400. UpdateThreadContext(std::map<std::string, std::string>{});
  401. Status ret = graph_manager_.RegisterCallBackFunc(key, callback);
  402. if (ret != SUCCESS) {
  403. GELOGE(ret, "[Register][CallBackFunc] failed, InnerSession:%lu register %s.", session_id_, key.c_str());
  404. REPORT_CALL_ERROR("E19999",
  405. "GraphManager RegisterCallBackFunc failed, InnerSession:%lu register %s.",
  406. session_id_, key.c_str());
  407. return ret;
  408. }
  409. GELOGI("[InnerSession:%lu] register %s callback function success.", session_id_, key.c_str());
  410. return SUCCESS;
  411. }
  412. Status InnerSession::BuildGraph(uint32_t graph_id, const std::vector<InputTensorInfo> &inputs) {
  413. UpdateThreadContext(graph_id);
  414. GELOGI("[InnerSession:%lu] build graph on session, graph_id=%u.", session_id_, graph_id);
  415. std::vector<ge::GeTensor> ge_inputs;
  416. for (auto const &input : inputs) {
  417. std::vector<int64_t> input_dims;
  418. std::transform(input.dims.begin(), input.dims.end(), std::back_inserter(input_dims),
  419. [](int64_t x) -> int64_t { return x; });
  420. GeShape input_shape(input_dims);
  421. GeTensorDesc input_tensor_desc;
  422. input_tensor_desc.SetShape(input_shape);
  423. input_tensor_desc.SetDataType(static_cast<ge::DataType>(input.data_type));
  424. ge_inputs.emplace_back(input_tensor_desc);
  425. }
  426. GeRootModelPtr ge_root_model = nullptr;
  427. Status ret = graph_manager_.BuildGraph(graph_id, ge_inputs, ge_root_model, session_id_, true);
  428. if (ret != SUCCESS) {
  429. GELOGE(ret, "[Build][Graph] failed, InnerSession:%lu graph_id=%u.", session_id_, graph_id);
  430. REPORT_CALL_ERROR("E19999",
  431. "GraphManager BuildGraph failed, InnerSession:%lu graph_id=%u.", session_id_, graph_id);
  432. return ret;
  433. }
  434. GELOGI("[InnerSession:%lu] build graph success, graph_id=%u.", session_id_, graph_id);
  435. return ret;
  436. }
  437. Status InnerSession::BuildGraph(uint32_t graph_id, const std::vector<ge::Tensor> &inputs) {
  438. UpdateThreadContext(graph_id);
  439. GELOGI("[InnerSession:%lu] build graph on session, graph_id=%u.", session_id_, graph_id);
  440. std::vector<ge::GeTensor> ge_inputs;
  441. for (const auto &input : inputs) {
  442. ge_inputs.emplace_back(TensorAdapter::AsGeTensor(input));
  443. }
  444. GeRootModelPtr ge_root_model = nullptr;
  445. Status ret = graph_manager_.BuildGraph(graph_id, ge_inputs, ge_root_model, session_id_, true);
  446. if (ret != SUCCESS) {
  447. GELOGE(ret, "[Build][Graph] failed, InnerSession:%lu graph_id=%u.", session_id_, graph_id);
  448. REPORT_CALL_ERROR("E19999",
  449. "GraphManager BuildGraph failed, InnerSession:%lu graph_id=%u.", session_id_, graph_id);
  450. return ret;
  451. }
  452. GELOGI("[InnerSession:%lu] build graph success, graph_id=%u.", session_id_, graph_id);
  453. return ret;
  454. }
  455. Status InnerSession::RunGraphAsync(uint32_t graph_id, const std::vector<ge::Tensor> &inputs,
  456. RunAsyncCallback callback) {
  457. UpdateThreadContext(graph_id);
  458. GELOGI("[InnerSession:%lu] run graph on session, graph_id=%u.", session_id_, graph_id);
  459. Status ret = graph_manager_.RunGraphAsync(graph_id, inputs, session_id_, callback);
  460. if (ret != SUCCESS) {
  461. GELOGE(ret, "[Run][GraphAsync]failed, InnerSession:%lu graph_id=%u.", session_id_, graph_id);
  462. REPORT_CALL_ERROR("E19999",
  463. "GraphManager RunGraphAsync failed, InnerSession:%lu graph_id=%u.", session_id_, graph_id);
  464. return ret;
  465. }
  466. GELOGI("[InnerSession:%lu] run graph success, graph_id=%u.", session_id_, graph_id);
  467. return ret;
  468. }
  469. const GraphManager &InnerSession::getGraphManagerObj() const { return graph_manager_; }
  470. void InnerSession::UpdateThreadContext(const std::map<std::string, std::string> &options) {
  471. GetThreadLocalContext().SetGlobalOption(GetMutableGlobalOptions());
  472. GetThreadLocalContext().SetSessionOption(options_);
  473. GetThreadLocalContext().SetGraphOption(options);
  474. GetContext().SetSessionId(session_id_);
  475. SetRtSocVersion();
  476. }
  477. void InnerSession::UpdateThreadContext(uint32_t graph_id) {
  478. auto options = graph_manager_.GetGraphOptions(graph_id);
  479. if (options == nullptr) {
  480. GELOGW("graph level options is null.");
  481. UpdateThreadContext(std::map<std::string, std::string>{});
  482. } else {
  483. UpdateThreadContext(*options);
  484. }
  485. }
  486. bool InnerSession::IsGraphNeedRebuild(uint32_t graph_id) {
  487. UpdateThreadContext(graph_id);
  488. return graph_manager_.IsGraphNeedRebuild(graph_id);
  489. }
  490. Status InnerSession::GetAllVariables(std::map<std::string, GeTensorDesc> &all_variables) {
  491. return VarManager::Instance(session_id_)->GetAllVariables(all_variables);
  492. }
  493. Status InnerSession::GenCheckPointGraph(const std::map<std::string, GeTensorDesc> &all_variables, Graph &graph) {
  494. return graph_manager_.GenCheckPointGraph(all_variables, graph);
  495. }
  496. Status InnerSession::SaveVariables(const Graph &graph, const std::vector<std::string> &var_names,
  497. const std::vector<Tensor> &outputs, std::vector<Tensor> &var_values) {
  498. return graph_manager_.SaveVariables(graph, var_names, outputs, var_values);
  499. }
  500. Status InnerSession::AddDumpProperties(const DumpProperties &dump_properties) {
  501. if (!is_dump_server_inited_) {
  502. if (dump_properties.IsDumpOpen() || dump_properties.IsOpDebugOpen()) {
  503. GE_IF_BOOL_EXEC(AdxDataDumpServerInit() != kDumpStatus,
  504. GELOGE(PARAM_INVALID, "[Init][AdxDataDumpServer] failed, session_id:%lu.", session_id_);
  505. return PARAM_INVALID)
  506. GELOGI("Init adx data dump server success");
  507. is_dump_server_inited_ = true;
  508. }
  509. }
  510. DumpManager::GetInstance().AddDumpProperties(session_id_, dump_properties);
  511. return SUCCESS;
  512. }
  513. Status InnerSession::RemoveDumpProperties() {
  514. DumpManager::GetInstance().RemoveDumpProperties(session_id_);
  515. if (is_dump_server_inited_ && DumpManager::GetInstance().GetDumpPropertiesMap().empty()) {
  516. GE_IF_BOOL_EXEC(AdxDataDumpServerUnInit() != kDumpStatus,
  517. GELOGE(PARAM_INVALID, "[UnInit][AdxDataDumpServer] failed, session_id:%lu.", session_id_);
  518. REPORT_INNER_ERROR("E19999", "RemoveDumpProperties failed because AdxDataDumpServerUnInit failed,"
  519. "session_id:%lu", session_id_);
  520. return PARAM_INVALID)
  521. GELOGI("UnInit adx data dump server success");
  522. is_dump_server_inited_ = false;
  523. }
  524. return SUCCESS;
  525. }
  526. void InnerSession::SetRtSocVersion() {
  527. const auto &global_options = GetMutableGlobalOptions();
  528. auto it = global_options.find(ge::SOC_VERSION);
  529. if (it != global_options.end()) {
  530. const char *soc_version = it->second.c_str();
  531. rtError_t rt_ret = rtSetSocVersion(soc_version);
  532. if (rt_ret != RT_ERROR_NONE) {
  533. GELOGW("Set soc version %s failed. ret:0x%X", soc_version, rt_ret);
  534. }
  535. GELOGI("Set soc version %s success.", soc_version);
  536. }
  537. }
  538. } // namespace ge

图引擎模块(GE)是MindSpore的一个子模块,其代码由C++实现,位于前端模块ME和底层硬件之间,起到承接作用。图引擎模块以ME下发的图作为输入,然后进行一系列的深度图优化操作,最后输出一张可以在底层硬件上高效运行的图。GE针对昇腾AI处理器的硬件结构特点,做了特定的优化工作,以此来充分发挥出昇腾AI处理器的强大算力。在进行模型训练/推理时,GE会被自动调用而用户并不感知。GE主要由GE API和GE Core两部分组成,详细的架构图如下所示