You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ge_prof.cc 13 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382
  1. /**
  2. * Copyright 2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "ge/ge_prof.h"
  17. #include "ge/ge_api.h"
  18. #include "init/gelib.h"
  19. #include "common/debug/log.h"
  20. #include "framework/common/debug/ge_log.h"
  21. #include "common/profiling/profiling_manager.h"
  22. #include "graph/load/graph_loader.h"
  23. #include "toolchain/prof_acl_api.h"
  24. using std::map;
  25. using std::string;
  26. using std::vector;
  27. namespace {
  28. const uint32_t kMaxDeviceNum = 64;
  29. const uint32_t kDeviceListIndex = 3;
  30. const std::string kProfilingInit = "prof_init";
  31. const std::string kProfilingFinalize = "prof_finalize";
  32. const std::string kProfilingStart = "prof_start";
  33. const std::string kProfilingStop = "prof_stop";
  34. const std::string kDeviceNums = "devNums";
  35. const std::string kDeviceIdList = "devIdList";
  36. const std::string kAicoreMetrics = "aicoreMetrics";
  37. const std::map<ge::ProfilingAicoreMetrics, std::string> kProfAicoreMetricsToString = {
  38. <<<<<<< HEAD:ge/client/ge_prof.cc
  39. {ge::kAicoreArithmaticThroughput, "AICORE_ARITHMATIC_THROUGHPUT"},
  40. {ge::kAicorePipeline, "AICORE_PIPELINE"},
  41. {ge::kAicoreSynchronization, "AICORE_SYNCHRONIZATION"},
  42. {ge::kAicoreMemory, "AICORE_MEMORY"},
  43. {ge::kAicoreInternalMemory, "AICORE_INTERNAL_MEMORY"},
  44. {ge::kAicoreStall, "AICORE_STALL"}};
  45. =======
  46. {ge::kAicoreArithmaticThroughput, "AICORE_ARITHMATIC_THROUGHPUT"},
  47. {ge::kAicorePipeline, "AICORE_PIPELINE"},
  48. {ge::kAicoreSynchronization, "AICORE_SYNCHRONIZATION"},
  49. {ge::kAicoreMemory, "AICORE_MEMORY"},
  50. {ge::kAicoreInternalMemory, "AICORE_INTERNAL_MEMORY"},
  51. {ge::kAicoreStall, "AICORE_STALL"}};
  52. >>>>>>> cd365aa247c64e30487d1e71e4f724a889848f80:src/ge/client/ge_prof.cc
  53. } // namespace
  54. static bool g_graph_prof_init_ = false;
  55. static std::mutex g_prof_mutex_;
  56. namespace ge {
  57. struct aclgrphProfConfig {
  58. ProfConfig config;
  59. };
  60. Status aclgrphProfInit(const char *profiler_path, uint32_t length) {
  61. GELOGT(TRACE_INIT, "Graph prof init start");
  62. std::shared_ptr<GELib> instance_ptr = ge::GELib::GetInstance();
  63. if (instance_ptr == nullptr || !instance_ptr->InitFlag()) {
  64. GELOGE(GE_CLI_GE_NOT_INITIALIZED, "Ge client is not initialized.");
  65. return FAILED;
  66. }
  67. std::lock_guard<std::mutex> lock(g_prof_mutex_);
  68. if (g_graph_prof_init_) {
  69. GELOGW("Multi graph profiling initializations.");
  70. return GE_PROF_MULTI_INIT;
  71. }
  72. Status ret = CheckPath(profiler_path, length);
  73. if (ret != SUCCESS) {
  74. GELOGE(ret, "Profiling config path is invalid.");
  75. return ret;
  76. }
  77. // if command mode is set, just return
  78. if (ProfilingManager::Instance().ProfilingOn()) {
  79. GELOGW("Graph prof init failed, cause profiling command pattern is running.");
  80. return GE_PROF_MODE_CONFLICT;
  81. }
  82. ret = ProfInit(profiler_path);
  83. if (ret != SUCCESS) {
  84. GELOGE(ret, "ProfInit init fail");
  85. return ret;
  86. }
  87. GraphLoader graph_loader;
  88. Command command;
  89. command.cmd_params.clear();
  90. command.cmd_type = kProfilingInit;
  91. command.module_index = PROF_MODEL_LOAD;
  92. ret = graph_loader.CommandHandle(command);
  93. if (ret != SUCCESS) {
  94. GELOGE(ret, "Handle profiling command %s failed, config = %s", kProfilingInit.c_str(), profiler_path);
  95. return ret;
  96. }
  97. if (!g_graph_prof_init_) {
  98. g_graph_prof_init_ = true;
  99. GELOGI("Profiling init successfully.");
  100. }
  101. GELOGI("Successfully execute GraphProfInit.");
  102. return SUCCESS;
  103. }
  104. Status aclgrphProfFinalize() {
  105. std::shared_ptr<GELib> instance_ptr = ge::GELib::GetInstance();
  106. if (instance_ptr == nullptr || !instance_ptr->InitFlag()) {
  107. GELOGE(GE_CLI_GE_NOT_INITIALIZED, "Ge client is not initialized.");
  108. return FAILED;
  109. }
  110. std::lock_guard<std::mutex> lock(g_prof_mutex_);
  111. // if command mode is set, just return
  112. if (ProfilingManager::Instance().ProfilingOn()) {
  113. GELOGW("Graph prof finalize failed, cause profiling command pattern is running.");
  114. return GE_PROF_MODE_CONFLICT;
  115. }
  116. if (!g_graph_prof_init_) {
  117. GELOGE(GE_PROF_NOT_INIT, "Graph not profiling initialize.");
  118. return GE_PROF_NOT_INIT;
  119. }
  120. GraphLoader graph_loader;
  121. Command command;
  122. command.cmd_params.clear();
  123. command.cmd_type = kProfilingFinalize;
  124. Status ret = graph_loader.CommandHandle(command);
  125. if (ret != SUCCESS) {
  126. GELOGE(ret, "Handle profiling command %s failed.", kProfilingFinalize.c_str());
  127. return ret;
  128. }
  129. ret = ProfFinalize();
  130. if (ret != SUCCESS) {
  131. GELOGE(ret, "Finalize profiling failed, result = %d", ret);
  132. }
  133. if (ret == SUCCESS) {
  134. g_graph_prof_init_ = false;
  135. GELOGI("Successfully execute GraphProfFinalize.");
  136. }
  137. return ret;
  138. }
  139. bool TransProfConfigToParam(const aclgrphProfConfig *profiler_config, vector<string> &prof_config_params) {
  140. prof_config_params.clear();
  141. prof_config_params.emplace_back(kDeviceNums);
  142. prof_config_params.emplace_back(std::to_string(profiler_config->config.devNums));
  143. prof_config_params.emplace_back(kDeviceIdList);
  144. std::string devID = "";
  145. if (profiler_config->config.devNums == 0) {
  146. GELOGW("The device num is invalid.");
  147. return false;
  148. }
  149. for (uint32_t i = 0; i < profiler_config->config.devNums; i++) {
  150. devID.append(std::to_string(profiler_config->config.devIdList[i]));
  151. if (i != profiler_config->config.devNums - 1) {
  152. devID.append(",");
  153. }
  154. }
  155. prof_config_params.push_back(devID);
  156. prof_config_params.push_back(kAicoreMetrics);
  157. auto iter =
  158. kProfAicoreMetricsToString.find(static_cast<ProfilingAicoreMetrics>(profiler_config->config.aicoreMetrics));
  159. if (iter == kProfAicoreMetricsToString.end()) {
  160. GELOGW("The prof aicore metrics is invalid.");
  161. return false;
  162. }
  163. prof_config_params.push_back(iter->second);
  164. return true;
  165. }
  166. bool isProfConfigValid(const uint32_t *deviceid_list, uint32_t device_nums) {
  167. if (deviceid_list == nullptr) {
  168. GELOGE(PARAM_INVALID, "deviceIdList is nullptr");
  169. return false;
  170. }
  171. if (device_nums == 0 || device_nums > kMaxDeviceNum) {
  172. GELOGE(PARAM_INVALID, "The device nums is invalid.");
  173. return false;
  174. }
  175. // real device num
  176. int32_t dev_count = 0;
  177. rtError_t rt_err = rtGetDeviceCount(&dev_count);
  178. if (rt_err != RT_ERROR_NONE) {
  179. GELOGE(INTERNAL_ERROR, "Get the Device count fail.");
  180. return false;
  181. }
  182. if (device_nums > static_cast<uint32_t>(dev_count)) {
  183. GELOGE(PARAM_INVALID, "Device num(%u) is not in range 1 ~ %d.", device_nums, dev_count);
  184. return false;
  185. }
  186. std::unordered_set<uint32_t> record;
  187. for (size_t i = 0; i < device_nums; ++i) {
  188. uint32_t dev_id = deviceid_list[i];
  189. if (dev_id >= static_cast<uint32_t>(dev_count)) {
  190. GELOGE(PARAM_INVALID, "Device id %u is not in range 0 ~ %d(exclude %d)", dev_id, dev_count, dev_count);
  191. return false;
  192. }
  193. if (record.count(dev_id) > 0) {
  194. GELOGE(PARAM_INVALID, "Device id %u is duplicatedly set", dev_id);
  195. return false;
  196. }
  197. record.insert(dev_id);
  198. }
  199. return true;
  200. }
  201. aclgrphProfConfig *aclgrphProfCreateConfig(uint32_t *deviceid_list, uint32_t device_nums,
  202. ProfilingAicoreMetrics aicore_metrics, ProfAicoreEvents *aicore_events,
  203. uint64_t data_type_config) {
  204. if (!isProfConfigValid(deviceid_list, device_nums)) {
  205. return nullptr;
  206. }
  207. aclgrphProfConfig *config = new (std::nothrow) aclgrphProfConfig();
  208. if (config == nullptr) {
  209. GELOGE(INTERNAL_ERROR, "new aclgrphProfConfig fail");
  210. return nullptr;
  211. }
  212. config->config.devNums = device_nums;
  213. if (memcpy_s(config->config.devIdList, sizeof(config->config.devIdList), deviceid_list,
  214. device_nums * sizeof(uint32_t)) != EOK) {
  215. GELOGE(INTERNAL_ERROR, "copy devID failed. size = %u", device_nums);
  216. delete config;
  217. return nullptr;
  218. }
  219. config->config.aicoreMetrics = static_cast<ProfAicoreMetrics>(aicore_metrics);
  220. config->config.dataTypeConfig = data_type_config;
  221. GELOGI("Successfully create prof config.");
  222. return config;
  223. }
  224. Status aclgrphProfDestroyConfig(aclgrphProfConfig *profiler_config) {
  225. if (profiler_config == nullptr) {
  226. GELOGE(PARAM_INVALID, "destroy profilerConfig failed, profilerConfig must not be nullptr");
  227. return PARAM_INVALID;
  228. }
  229. delete profiler_config;
  230. GELOGI("Successfully destroy prof config.");
  231. return SUCCESS;
  232. }
  233. Status aclgrphProfStart(aclgrphProfConfig *profiler_config) {
  234. if (profiler_config == nullptr) {
  235. GELOGE(PARAM_INVALID, "aclgrphProfConfig is invalid.");
  236. return FAILED;
  237. }
  238. std::shared_ptr<GELib> instance_ptr = ge::GELib::GetInstance();
  239. if (instance_ptr == nullptr || !instance_ptr->InitFlag()) {
  240. GELOGE(GE_CLI_GE_NOT_INITIALIZED, "Ge client is not initialized.");
  241. return FAILED;
  242. }
  243. std::lock_guard<std::mutex> lock(g_prof_mutex_);
  244. // if command mode is set, just return
  245. if (ProfilingManager::Instance().ProfilingOn()) {
  246. GELOGW("Graph prof finalize failed, cause profiling command pattern is running.");
  247. return GE_PROF_MODE_CONFLICT;
  248. }
  249. if (!g_graph_prof_init_) {
  250. GELOGE(GE_PROF_NOT_INIT, "Graph not profiling initialize.");
  251. return GE_PROF_NOT_INIT;
  252. }
  253. Status ret = ProfStartProfiling(&profiler_config->config);
  254. if (ret != SUCCESS) {
  255. GELOGE(ret, "Start profiling failed, prof result = %d", ret);
  256. return FAILED;
  257. }
  258. std::vector<string> prof_params;
  259. if (!TransProfConfigToParam(profiler_config, prof_params)) {
  260. GELOGE(PARAM_INVALID, "Transfer profilerConfig to string vector failed");
  261. return PARAM_INVALID;
  262. }
  263. GraphLoader graph_loader;
  264. Command command;
  265. command.cmd_params.clear();
  266. command.cmd_type = kProfilingStart;
  267. command.cmd_params = prof_params;
  268. command.module_index = profiler_config->config.dataTypeConfig;
  269. GELOGI("Profiling will start, device nums:%s , deviceID:[%s], data type config: 0x%llx", prof_params[0].c_str(),
  270. prof_params[kDeviceListIndex].c_str(), command.module_index);
  271. ret = graph_loader.CommandHandle(command);
  272. if (ret != SUCCESS) {
  273. GELOGE(ret, "Handle profiling command failed");
  274. return FAILED;
  275. }
  276. GELOGI("Successfully execute GraphProfStartProfiling.");
  277. return SUCCESS;
  278. }
  279. Status aclgrphProfStop(aclgrphProfConfig *profiler_config) {
  280. if (profiler_config == nullptr) {
  281. GELOGE(PARAM_INVALID, "aclgrphProfConfig is invalid.");
  282. return FAILED;
  283. }
  284. std::shared_ptr<GELib> instance_ptr = ge::GELib::GetInstance();
  285. if (instance_ptr == nullptr || !instance_ptr->InitFlag()) {
  286. GELOGE(GE_CLI_GE_NOT_INITIALIZED, "Ge client is not initialized.");
  287. return FAILED;
  288. }
  289. std::lock_guard<std::mutex> lock(g_prof_mutex_);
  290. // if command mode is set, just return
  291. if (ProfilingManager::Instance().ProfilingOn()) {
  292. GELOGW("Graph prof finalize failed, cause profiling command pattern is running.");
  293. return GE_PROF_MODE_CONFLICT;
  294. }
  295. if (!g_graph_prof_init_) {
  296. GELOGE(GE_PROF_NOT_INIT, "Graph not profiling initialize.");
  297. return GE_PROF_NOT_INIT;
  298. }
  299. for (uint32_t i = 0; i < profiler_config->config.devNums; i++) {
  300. uint64_t data_type_config;
  301. Status status = ProfGetDataTypeConfig(profiler_config->config.devIdList[i], data_type_config);
  302. if (status != SUCCESS) {
  303. GELOGE(status, "Prof get data type config failed, prof result = %d", status);
  304. return status;
  305. }
  306. if (data_type_config != profiler_config->config.dataTypeConfig) {
  307. GELOGE(FAILED, "data type config verify failed");
  308. return FAILED;
  309. }
  310. }
  311. std::vector<string> prof_params;
  312. if (!TransProfConfigToParam(profiler_config, prof_params)) {
  313. GELOGE(PARAM_INVALID, "Transfer profilerConfig to string vector failed");
  314. return PARAM_INVALID;
  315. }
  316. GraphLoader graph_loader;
  317. Command command;
  318. command.cmd_params.clear();
  319. command.cmd_type = kProfilingStop;
  320. command.cmd_params = prof_params;
  321. command.module_index = profiler_config->config.dataTypeConfig;
  322. GELOGI("Profiling will stop, device nums:%s , deviceID:[%s], data type config: 0x%llx", prof_params[0].c_str(),
  323. prof_params[kDeviceListIndex].c_str(), command.module_index);
  324. <<<<<<< HEAD:ge/client/ge_prof.cc
  325. Status ret = graph_loader.CommandHandle(command);
  326. =======
  327. ret = graph_loader.CommandHandle(command);
  328. >>>>>>> cd365aa247c64e30487d1e71e4f724a889848f80:src/ge/client/ge_prof.cc
  329. if (ret != SUCCESS) {
  330. GELOGE(ret, "Handle profiling command failed");
  331. return FAILED;
  332. }
  333. ret = ProfStopProfiling(&profiler_config->config);
  334. if (ret != SUCCESS) {
  335. GELOGE(ret, "Stop profiling failed, prof result = %d", ret);
  336. return ret;
  337. }
  338. GELOGI("Successfully execute GraphProfStopProfiling.");
  339. return SUCCESS;
  340. }
  341. } // namespace ge

图引擎模块（GE）是MindSpore的一个子模块，其代码由C++实现，位于前端模块ME和底层硬件之间，起到承接作用。图引擎模块以ME下发的图作为输入，然后进行一系列的深度图优化操作，最后输出一张可以在底层硬件上高效运行的图。GE针对昇腾AI处理器的硬件结构特点，做了特定的优化工作，以此来充分发挥出昇腾AI处理器的强大算力。在进行模型训练/推理时，GE会被自动调用而用户并不感知。GE主要由GE API和GE Core两部分组成，详细的架构图如下所示。