
aicpu_constant_folding_pass.cc 28 kB

/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "graph/passes/aicpu_constant_folding_pass.h"

#include <memory>
#include <vector>

#include "framework/common/debug/log.h"
#include "common/ge/ge_util.h"
#include "framework/common/types.h"
#include "framework/common/debug/ge_log.h"
#include "graph/debug/ge_attr_define.h"
#include "graph/utils/attr_utils.h"
#include "graph/utils/node_utils.h"
#include "graph/utils/op_desc_utils.h"
#include "graph/utils/type_utils.h"
#include "init/gelib.h"
#include "opskernel_manager/ops_kernel_builder_manager.h"

namespace {
const char *const kKernelLibName = "aicpu_tf_kernel";
const char *const kNotSupported = "0";
const uint64_t kReleaseFlag = 1;
const uint64_t kOpsFlag = 1;
const uint64_t kDouble = 2;
}  // namespace

namespace ge {
Status AicpuConstantFoldingPass::Run(ge::NodePtr &node) {
  GE_CHECK_NOTNULL(node);
  GELOGD("Start aicpu constant folding on node [%s]", node->GetName().c_str());
  if (IsSkipFold(node)) {
    return SUCCESS;
  }
  vector<ConstGeTensorPtr> weight_vec;
  bool flag = CheckInput(node, weight_vec);
  if (!flag) {
    return SUCCESS;
  }
  OpDescPtr node_desc = node->GetOpDesc();  // checked before
  vector<DataPtrInfo> data_vec;
  vector<AddrAndType> input_addrs;
  vector<uint64_t> output_addrs;
  Status ret = GetInputAddrs(weight_vec, input_addrs);
  if (ret != SUCCESS) {
    ReleaseMemory(input_addrs, output_addrs, data_vec);
    return SUCCESS;
  }
  ret = GetOutputAddrs(node_desc, output_addrs);
  if (ret != SUCCESS) {
    ReleaseMemory(input_addrs, output_addrs, data_vec);
    return SUCCESS;
  }
  ret = LaunchSingleOpRunTask(node, input_addrs, output_addrs);
  if (ret != SUCCESS) {
    ReleaseMemory(input_addrs, output_addrs, data_vec);
    return SUCCESS;
  }
  GELOGI("[Node:%s] Launch singleOpRunTask success", node->GetName().c_str());
  vector<uint64_t> data_infos;
  ret = GenerateDataPtrInfo(output_addrs, data_vec, data_infos);
  if (ret != SUCCESS) {
    ReleaseMemory(input_addrs, output_addrs, data_vec);
    return SUCCESS;
  }
  GELOGI("[Node:%s] Generate dataPtrInfo success", node->GetName().c_str());
  ret = LaunchMemCopyTask(data_infos);
  if (ret != SUCCESS) {
    ReleaseMemory(input_addrs, output_addrs, data_vec);
    return SUCCESS;
  }
  GELOGI("[Node:%s] Launch memCopyTask success", node->GetName().c_str());
  vector<GeTensorPtr> outputs;
  ret = GenerateGeTensor(node_desc, data_vec, outputs);
  if (ret != SUCCESS) {
    ReleaseMemory(input_addrs, output_addrs, data_vec);
    return SUCCESS;
  }
  ReleaseMemory(input_addrs, output_addrs, data_vec);
  GELOGI("[Node:%s] Generate geTensor success", node->GetName().c_str());
  return Folding(node, outputs);
}

bool AicpuConstantFoldingPass::CheckInput(const NodePtr &node, vector<ConstGeTensorPtr> &weight_vec) {
  OpDescPtr node_desc = node->GetOpDesc();
  if (node_desc == nullptr) {
    GELOGW("Opdesc of %s is null", node->GetName().c_str());
    return false;
  }
  DataType data_type = node_desc->GetOutputDesc(0).GetDataType();
  Format format = node_desc->GetOutputDesc(0).GetFormat();
  GELOGD("Current [node:%s, type:%s] info: format: %s, datatype:%s", node->GetName().c_str(), node->GetType().c_str(),
         TypeUtils::FormatToSerialString(format).c_str(), TypeUtils::DataTypeToSerialString(data_type).c_str());
  auto input_nodes = OpDescUtils::GetConstInputNode(*node);
  if (input_nodes.empty() || input_nodes.size() != node_desc->GetInputsSize()) {
    GELOGD("Const input nodes size is %zu, and nodeDesc inputsSize is %zu, skip fold.", input_nodes.size(),
           node_desc->GetInputsSize());
    return false;
  }
  weight_vec = OpDescUtils::GetInputData(input_nodes);
  return true;
}

Status AicpuConstantFoldingPass::GetInputAddrs(const vector<ConstGeTensorPtr> &weight_vec,
                                               vector<AddrAndType> &input_addrs) {
  if (weight_vec.empty()) {
    REPORT_INNER_ERROR("E19999", "Param weight_vec is empty, check invalid");
    GELOGE(FAILED, "[Check][Param] Weight is null");
    return FAILED;
  }
  for (const ConstGeTensorPtr &weight : weight_vec) {
    void *input_addr = nullptr;
    GE_CHK_RT_RET(rtMalloc(&input_addr, weight->GetData().size(), RT_MEMORY_HBM));
    rtError_t rt_ret = rtMemcpy(input_addr, weight->GetData().size(), weight->GetData().data(),
                                weight->GetData().size(), RT_MEMCPY_HOST_TO_DEVICE);
    if (rt_ret != RT_ERROR_NONE) {
      REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%zu, ret = 0x%X", weight->GetData().size(), rt_ret);
      GELOGE(rt_ret, "[Call][RtMemcpy] failed, size:%zu, ret = 0x%X", weight->GetData().size(), rt_ret);
      GE_CHK_RT(rtFree(input_addr));
      return FAILED;
    }
    AddrAndType input_info = {static_cast<uint64_t>(reinterpret_cast<uintptr_t>(input_addr)), kData};
    input_addrs.emplace_back(input_info);
  }
  return SUCCESS;
}
Status AicpuConstantFoldingPass::GetOutputAddrs(const OpDescPtr &node_desc, vector<uint64_t> &output_addrs) {
  if (node_desc->GetOutputsSize() == 0) {
    REPORT_INNER_ERROR("E19999", "Output desc size of op:%s(%s) is 0, check invalid",
                       node_desc->GetName().c_str(), node_desc->GetType().c_str());
    GELOGE(FAILED, "[Get][OutputsSize] Output desc size of op:%s(%s) is 0",
           node_desc->GetName().c_str(), node_desc->GetType().c_str());
    return FAILED;
  }
  for (size_t i = 0; i < node_desc->GetOutputsSize(); ++i) {
    void *summary_addr = nullptr;
    GE_CHK_RT_RET(rtMalloc(&summary_addr, sizeof(aicpu::FWKAdapter::ResultSummary), RT_MEMORY_HBM));
    output_addrs.emplace_back(static_cast<uint64_t>(reinterpret_cast<uintptr_t>(summary_addr)));
  }
  return SUCCESS;
}
Status AicpuConstantFoldingPass::GenerateDataPtrInfo(const vector<uint64_t> &output_addrs,
                                                     vector<DataPtrInfo> &data_vec, vector<uint64_t> &data_infos) {
  for (uint64_t output_addr : output_addrs) {
    aicpu::FWKAdapter::ResultSummary result_summary;
    GE_CHK_RT_RET(rtMemcpy(&result_summary, sizeof(aicpu::FWKAdapter::ResultSummary),
                           reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(output_addr)),
                           sizeof(aicpu::FWKAdapter::ResultSummary), RT_MEMCPY_DEVICE_TO_HOST));
    void *raw_data_addr = nullptr;
    GE_CHK_RT_RET(rtMalloc(&raw_data_addr, result_summary.raw_data_size, RT_MEMORY_HBM));
    void *shape_data_addr = nullptr;
    // shape_data_size = 0 means scalar
    if (result_summary.shape_data_size != 0) {
      rtError_t rt_ret = rtMalloc(&shape_data_addr, result_summary.shape_data_size, RT_MEMORY_HBM);
      if (rt_ret != RT_ERROR_NONE) {
        REPORT_CALL_ERROR("E19999", "Call rtMalloc failed, size:%lu, ret = 0x%X",
                          result_summary.shape_data_size, rt_ret);
        GELOGE(rt_ret, "[Call][RtMalloc] failed, size:%lu, ret = 0x%X", result_summary.shape_data_size, rt_ret);
        GE_CHK_RT(rtFree(raw_data_addr));
        return FAILED;
      }
    }
    DataPtrInfo raw_data_info;
    raw_data_info.release_flag = kReleaseFlag;
    raw_data_info.data_size = result_summary.raw_data_size;
    raw_data_info.src_ptr = result_summary.raw_data_ptr;
    raw_data_info.dst_ptr = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(raw_data_addr));
    data_vec.emplace_back(raw_data_info);
    DataPtrInfo shape_data_info;
    shape_data_info.release_flag = kReleaseFlag;
    shape_data_info.data_size = result_summary.shape_data_size;
    shape_data_info.src_ptr = result_summary.shape_data_ptr;
    shape_data_info.dst_ptr = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(shape_data_addr));
    data_vec.emplace_back(shape_data_info);
  }
  for (const DataPtrInfo &data_info : data_vec) {
    data_infos.emplace_back(static_cast<uint64_t>(reinterpret_cast<uintptr_t>(&data_info)));
  }
  return SUCCESS;
}

Status AicpuConstantFoldingPass::UpdateWorkSpaceAddr(string &task_info, STR_FWK_OP_KERNEL &task) {
  // Update the workspace_addr
  if (task_info.empty()) {
    REPORT_INNER_ERROR("E19999", "Param task_info is empty, check invalid");
    GELOGE(FAILED, "[Check][Param] task_info is empty ");
    return FAILED;
  }
  void *workspace_addr = nullptr;
  GE_CHK_RT_RET(rtMalloc(&workspace_addr, task_info.size(), RT_MEMORY_HBM));
  rtError_t rt_ret =
      rtMemcpy(workspace_addr, task_info.size(), task_info.data(), task_info.size(), RT_MEMCPY_HOST_TO_DEVICE);
  if (rt_ret != RT_ERROR_NONE) {
    REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%zu, ret = 0x%X", task_info.size(), rt_ret);
    GELOGE(rt_ret, "[Call][RtMemcpy] failed, size:%zu, ret = 0x%X", task_info.size(), rt_ret);
    GE_CHK_RT(rtFree(workspace_addr));
    return FAILED;
  }
  uint64_t workspace_base_addr = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(workspace_addr));
  task.fwkKernelBase.fwk_kernel.workspaceBaseAddr = workspace_base_addr;
  return SUCCESS;
}

Status AicpuConstantFoldingPass::UpdateInputAndOutputAddr(const vector<uint64_t> &io_addrs, STR_FWK_OP_KERNEL &task) {
  auto addrs_size = sizeof(uint64_t) * (io_addrs.size());
  if (addrs_size <= 0) {
    REPORT_INNER_ERROR("E19999", "Param io_addrs size is 0, check invalid");
    GELOGE(FAILED, "[Check][Param] addrs_size is less than 1 ");
    return FAILED;
  }
  void *input_output_addr = nullptr;
  GE_CHK_RT_RET(rtMalloc(&input_output_addr, addrs_size, RT_MEMORY_HBM));
  rtError_t rt_ret = rtMemcpy(input_output_addr, addrs_size, io_addrs.data(), addrs_size, RT_MEMCPY_HOST_TO_DEVICE);
  if (rt_ret != RT_ERROR_NONE) {
    REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%zu, ret = 0x%X", addrs_size, rt_ret);
    GELOGE(rt_ret, "[Call][RtMemcpy] failed, size:%zu, ret = 0x%X", addrs_size, rt_ret);
    GE_CHK_RT(rtFree(input_output_addr));
    return FAILED;
  }
  uint64_t in_out_addr = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(input_output_addr));
  task.fwkKernelBase.fwk_kernel.inputOutputAddr = in_out_addr;
  return SUCCESS;
}

Status AicpuConstantFoldingPass::UpdateSingleOpAddr(string &task_info, const vector<AddrAndType> &input_addrs,
                                                    const vector<uint64_t> &outputs_addr_vec, STR_FWK_OP_KERNEL &task) {
  // Build the SingleOpAddr
  vector<uint64_t> inputs_addr_vec;
  for (const auto &item : input_addrs) {
    inputs_addr_vec.push_back(item.input_addr);
  }
  vector<uint64_t> io_addrs;
  io_addrs.insert(io_addrs.end(), inputs_addr_vec.begin(), inputs_addr_vec.end());
  io_addrs.insert(io_addrs.end(), outputs_addr_vec.begin(), outputs_addr_vec.end());
  Status ret = UpdateInputAndOutputAddr(io_addrs, task);
  if (ret != SUCCESS) {
    GELOGE(ret, "[Update][InputAndOutputAddr] failed, ret:%d", ret);
    return ret;
  }
  ret = UpdateWorkSpaceAddr(task_info, task);
  if (ret != SUCCESS) {
    GELOGE(ret, "[Update][WorkSpaceAddr] failed, ret:%d", ret);
    return ret;
  }
  return SUCCESS;
}

Status AicpuConstantFoldingPass::UpdateMemCopyAddr(string &task_info, const vector<uint64_t> &data_infos,
                                                   vector<uint64_t> &internal_addrs, STR_FWK_OP_KERNEL &task) {
  vector<uint64_t> release_flags;
  vector<uint64_t> data_sizes;
  vector<uint64_t> src_addrs;
  vector<uint64_t> dst_addrs;
  for (auto item : data_infos) {
    auto *data_info_ptr = reinterpret_cast<DataPtrInfo *>(reinterpret_cast<uintptr_t>(item));  // pointer cannot be null
    release_flags.push_back(data_info_ptr->release_flag);
    data_sizes.push_back(data_info_ptr->data_size);
    src_addrs.push_back(data_info_ptr->src_ptr);
    dst_addrs.push_back(data_info_ptr->dst_ptr);
  }
  vector<vector<uint64_t>> inputs = {release_flags, data_sizes, src_addrs, dst_addrs};
  auto data_size = sizeof(uint64_t) * (data_infos.size());
  vector<uint64_t> io_addrs;
  if (data_infos.size() > 0) {
    for (const auto &item : inputs) {
      void *input_addr_ptr = nullptr;
      GE_CHK_RT_RET(rtMalloc(&input_addr_ptr, data_size, RT_MEMORY_HBM));
      rtError_t rt_ret = rtMemcpy(input_addr_ptr, data_size, item.data(), data_size, RT_MEMCPY_HOST_TO_DEVICE);
      if (rt_ret != RT_ERROR_NONE) {
        REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%zu, ret = 0x%X", data_size, rt_ret);
        GELOGE(rt_ret, "[Call][RtMemcpy] failed, size:%zu, ret = 0x%X", data_size, rt_ret);
        GE_CHK_RT(rtFree(input_addr_ptr));
        return FAILED;
      }
      uint64_t input_addr = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(input_addr_ptr));
      io_addrs.push_back(input_addr);
    }
  }
  internal_addrs = io_addrs;
  Status ret = UpdateInputAndOutputAddr(io_addrs, task);
  if (ret != SUCCESS) {
    GELOGE(ret, "[Update][InputAndOutputAddr] failed, ret:%d", ret);
    return ret;
  }
  ret = UpdateWorkSpaceAddr(task_info, task);
  if (ret != SUCCESS) {
    GELOGE(ret, "[Update][WorkSpaceAddr] failed, ret:%d", ret);
    return ret;
  }
  return SUCCESS;
}

Status AicpuConstantFoldingPass::LaunchSingleOpRunTask(const NodePtr &node, const vector<AddrAndType> &input_addrs,
                                                       const vector<uint64_t> &output_addrs) {
  void *task_buf = nullptr;
  auto instance_ptr = ge::GELib::GetInstance();
  if (instance_ptr == nullptr || !instance_ptr->InitFlag()) {
    REPORT_INNER_ERROR("E19999", "GeLib is not init before, check invalid");
    GELOGE(GE_CLI_GE_NOT_INITIALIZED, "[Check][Param] GE is not initialized");
    return GE_CLI_GE_NOT_INITIALIZED;
  }
  auto kernel_builder = OpsKernelBuilderManager::Instance().GetOpsKernelBuilder(kKernelLibName);
  if (kernel_builder == nullptr) {
    REPORT_INNER_ERROR("E19999", "Find ops kernel by name:%s failed", kKernelLibName);
    GELOGE(FAILED, "[Get][OpsKernelBuilder] by name:%s failed", kKernelLibName);
    return FAILED;
  }
  STR_FWK_OP_KERNEL aicpu_task;
  aicpu_task.fwkKernelBase.fwk_kernel.inputOutputAddr = 0;
  aicpu_task.fwkKernelBase.fwk_kernel.workspaceBaseAddr = 0;
  aicpu_task.fwkKernelBase.fwk_kernel.extInfoAddr = 0;
  aicpu_task.fwkKernelBase.fwk_kernel.extInfoLen = 0;
  std::string task_info;
  Status ret = kernel_builder->GenSingleOpRunTask(node, aicpu_task, task_info);
  if (ret != SUCCESS) {
    return ret;
  }
  std::function<void()> callback = [&]() {
    void *input_output_ptr =
        reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(aicpu_task.fwkKernelBase.fwk_kernel.inputOutputAddr));
    if (input_output_ptr != nullptr) {
      GE_CHK_RT(rtFree(input_output_ptr));
    }
    void *workspace_addr_ptr =
        reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(aicpu_task.fwkKernelBase.fwk_kernel.workspaceBaseAddr));
    if (workspace_addr_ptr != nullptr) {
      GE_CHK_RT(rtFree(workspace_addr_ptr));
    }
  };
  GE_MAKE_GUARD(release, callback);
  ret = UpdateSingleOpAddr(task_info, input_addrs, output_addrs, aicpu_task);
  if (ret != SUCCESS) {
    GELOGE(ret, "[Update][SingleOpAddr] failed, ret:%d", ret);
    return ret;
  }
  ret = GenerateTaskForLaunch(aicpu_task, task_buf);
  if (ret != SUCCESS) {
    GELOGE(ret, "[Generate][Task] For Launch failed, ret:%d", ret);
    return ret;
  }
  ret = KernelLaunch(task_buf);
  if (ret != SUCCESS) {
    GELOGE(ret, "[Call][KernelLaunch] failed, ret:%d", ret);
    return ret;
  }
  return SUCCESS;
}

Status AicpuConstantFoldingPass::LaunchMemCopyTask(const vector<uint64_t> &data_infos) {
  void *task_buf = nullptr;
  auto instance_ptr = ge::GELib::GetInstance();
  if (instance_ptr == nullptr || !instance_ptr->InitFlag()) {
    REPORT_INNER_ERROR("E19999", "GeLib is not init before, check invalid");
    GELOGE(GE_CLI_GE_NOT_INITIALIZED, "[Check][Param] GE is not initialized");
    return GE_CLI_GE_NOT_INITIALIZED;
  }
  auto kernel_builder = OpsKernelBuilderManager::Instance().GetOpsKernelBuilder(kKernelLibName);
  if (kernel_builder == nullptr) {
    REPORT_INNER_ERROR("E19999", "Find ops kernel by name:%s failed", kKernelLibName);
    GELOGE(FAILED, "[Get][OpsKernelBuilder] by name:%s failed", kKernelLibName);
    return FAILED;
  }
  STR_FWK_OP_KERNEL aicpu_task;
  aicpu_task.fwkKernelBase.fwk_kernel.inputOutputAddr = 0;
  aicpu_task.fwkKernelBase.fwk_kernel.workspaceBaseAddr = 0;
  aicpu_task.fwkKernelBase.fwk_kernel.extInfoAddr = 0;
  aicpu_task.fwkKernelBase.fwk_kernel.extInfoLen = 0;
  std::string task_info;
  Status ret = kernel_builder->GenMemCopyTask(data_infos.size(), aicpu_task, task_info);
  if (ret != SUCCESS) {
    return ret;
  }
  vector<uint64_t> internal_addrs;
  std::function<void()> callback = [&]() {
    for (auto item : internal_addrs) {
      GE_CHK_RT(rtFree(reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(item))));  // pointer cannot be null
    }
    void *input_output_ptr =
        reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(aicpu_task.fwkKernelBase.fwk_kernel.inputOutputAddr));
    if (input_output_ptr != nullptr) {
      GE_CHK_RT(rtFree(input_output_ptr));
    }
    void *workspace_addr_ptr =
        reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(aicpu_task.fwkKernelBase.fwk_kernel.workspaceBaseAddr));
    if (workspace_addr_ptr != nullptr) {
      GE_CHK_RT(rtFree(workspace_addr_ptr));
    }
  };
  GE_MAKE_GUARD(release, callback);
  ret = UpdateMemCopyAddr(task_info, data_infos, internal_addrs, aicpu_task);
  if (ret != SUCCESS) {
    GELOGE(ret, "[Update][MemCopyAddr] failed, ret:%d", ret);
    return ret;
  }
  ret = GenerateTaskForLaunch(aicpu_task, task_buf);
  if (ret != SUCCESS) {
    GELOGE(ret, "[Generate][Task] For Launch failed, ret:%d", ret);
    return ret;
  }
  ret = KernelLaunch(task_buf);
  if (ret != SUCCESS) {
    GELOGE(ret, "[Call][KernelLaunch] failed, ret:%d", ret);
    return ret;
  }
  return SUCCESS;
}

Status AicpuConstantFoldingPass::GenerateTaskForLaunch(STR_FWK_OP_KERNEL &aicpu_task, void *&task_buf) {
  GE_CHK_RT_RET(rtMalloc(&task_buf, sizeof(STR_FWK_OP_KERNEL), RT_MEMORY_HBM));
  rtError_t rt_ret = rtMemcpy(task_buf, sizeof(STR_FWK_OP_KERNEL), reinterpret_cast<void *>(&aicpu_task),
                              sizeof(STR_FWK_OP_KERNEL), RT_MEMCPY_HOST_TO_DEVICE);
  if (rt_ret != RT_ERROR_NONE) {
    REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%zu, ret = 0x%X", sizeof(STR_FWK_OP_KERNEL), rt_ret);
    GELOGE(rt_ret, "[Call][RtMemcpy] failed, size:%zu, ret = 0x%X", sizeof(STR_FWK_OP_KERNEL), rt_ret);
    GE_CHK_RT(rtFree(task_buf));
    return FAILED;
  }
  return SUCCESS;
}
Status AicpuConstantFoldingPass::KernelLaunch(void *task_buf) {
  rtModel_t model = nullptr;
  rtStream_t stream = nullptr;
  rtStream_t stream_run = nullptr;
  std::function<void()> callback = [&]() {
    if (task_buf != nullptr) {
      GE_CHK_RT(rtFree(task_buf));
    }
    if (model != nullptr) {
      GE_CHK_RT(rtModelDestroy(model));
    }
    if (stream != nullptr) {
      GE_CHK_RT(rtStreamDestroy(stream));
    }
    if (stream_run != nullptr) {
      GE_CHK_RT(rtStreamDestroy(stream_run));
    }
  };
  GE_MAKE_GUARD(release, callback);
  rtError_t rt_ret = rtModelCreate(&model, 0);
  if (rt_ret != RT_ERROR_NONE) {
    REPORT_CALL_ERROR("E19999", "Call rtModelCreate failed, ret = 0x%X", rt_ret);
    GELOGE(rt_ret, "[Create][Model] failed, ret = 0x%X", rt_ret);
    return FAILED;
  }
  rt_ret = rtStreamCreate(&stream, 0);
  if (rt_ret != RT_ERROR_NONE) {
    REPORT_CALL_ERROR("E19999", "Call rtStreamCreate failed, ret = 0x%X", rt_ret);
    GELOGE(rt_ret, "[Create][Stream] failed, ret = 0x%X", rt_ret);
    return FAILED;
  }
  rt_ret = rtModelBindStream(model, stream, 0);
  if (rt_ret != RT_ERROR_NONE) {
    REPORT_CALL_ERROR("E19999", "Call rtModelBindStream failed, ret = 0x%X", rt_ret);
    GELOGE(rt_ret, "[Call][RtModelBindStream] failed, ret = 0x%X", rt_ret);
    return FAILED;
  }
  rt_ret = rtKernelLaunchEx(task_buf, sizeof(STR_FWK_OP_KERNEL), 0, stream);
  if (rt_ret != RT_ERROR_NONE) {
    REPORT_CALL_ERROR("E19999", "Call rtKernelLaunchEx failed, ret = 0x%X", rt_ret);
    GELOGE(rt_ret, "[Call][RtKernelLaunchEx] failed, ret = 0x%X", rt_ret);
    return FAILED;
  }
  rt_ret = rtModelLoadComplete(model);
  if (rt_ret != RT_ERROR_NONE) {
    REPORT_CALL_ERROR("E19999", "Call rtModelLoadComplete failed, ret = 0x%X", rt_ret);
    GELOGE(rt_ret, "[Call][RtModelLoadComplete] failed, ret = 0x%X", rt_ret);
    return FAILED;
  }
  rt_ret = rtStreamCreate(&stream_run, 0);
  if (rt_ret != RT_ERROR_NONE) {
    REPORT_CALL_ERROR("E19999", "Call rtStreamCreate failed, ret = 0x%X", rt_ret);
    GELOGE(rt_ret, "[Call][RtStreamCreate] failed, ret = 0x%X", rt_ret);
    return FAILED;
  }
  rt_ret = rtModelExecute(model, stream_run, 0);
  if (rt_ret != RT_ERROR_NONE) {
    REPORT_CALL_ERROR("E19999", "Call rtModelExecute failed, ret = 0x%X", rt_ret);
    GELOGE(rt_ret, "[Call][RtModelExecute] failed, ret = 0x%X", rt_ret);
    return FAILED;
  }
  rt_ret = rtStreamSynchronize(stream_run);
  if (rt_ret != RT_ERROR_NONE) {
    REPORT_CALL_ERROR("E19999", "Call rtStreamSynchronize failed, ret = 0x%X", rt_ret);
    GELOGE(rt_ret, "[Call][RtStreamSynchronize] failed, ret = 0x%X", rt_ret);
    return FAILED;
  }
  return SUCCESS;
}
Status AicpuConstantFoldingPass::GenerateGeTensor(const OpDescPtr &node_desc, const vector<DataPtrInfo> &data_vec,
                                                  vector<GeTensorPtr> &outputs) {
  if ((node_desc->GetOutputsSize() * kDouble) != data_vec.size()) {
    REPORT_INNER_ERROR("E19999", "Output desc size:%zu of op:%s(%s), after multi 2, not equal to data_vec.size:%zu, "
                       "check invalid", node_desc->GetOutputsSize(),
                       node_desc->GetName().c_str(), node_desc->GetType().c_str(), data_vec.size());
    GELOGE(FAILED, "[Check][Param] Output desc size:%zu of op:%s(%s), after multi 2, not equal to data_vec.size:%zu",
           node_desc->GetOutputsSize(), node_desc->GetName().c_str(), node_desc->GetType().c_str(), data_vec.size());
    return FAILED;
  }
  for (size_t i = 0; i < node_desc->GetOutputsSize(); i++) {
    auto output_tensor_desc = node_desc->GetOutputDesc(static_cast<uint32_t>(i));
    GeTensorPtr output_ptr = MakeShared<GeTensor>(output_tensor_desc);
    if (output_ptr == nullptr) {
      REPORT_CALL_ERROR("E19999", "New GeTensor failed");
      GELOGE(FAILED, "[New][GeTensor] failed");
      return FAILED;
    }
    const DataPtrInfo &raw_data_info = data_vec.at(i * kDouble);
    uint64_t raw_data_size = raw_data_info.data_size;
    std::unique_ptr<uint8_t[]> data_addr(new (std::nothrow) uint8_t[raw_data_size]());
    if (data_addr == nullptr) {
      REPORT_CALL_ERROR("E19999", "New Buffer failed, size:%lu", raw_data_size);
      GELOGE(MEMALLOC_FAILED, "[New][Buffer] failed, size:%lu", raw_data_size);
      return INTERNAL_ERROR;
    }
    GE_CHK_RT_RET(rtMemcpy(data_addr.get(), raw_data_size,
                           reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(raw_data_info.dst_ptr)), raw_data_size,
                           RT_MEMCPY_DEVICE_TO_HOST));
    GE_IF_BOOL_EXEC(output_ptr->SetData(data_addr.get(), raw_data_size) != GRAPH_SUCCESS,
                    GELOGE(FAILED, "[Set][Data] for node:%s output[%zu] failed", node_desc->GetName().c_str(), i);
                    return FAILED);
    GELOGD("GenerateGeTensor: raw_data_size %lu", raw_data_size);
    const DataPtrInfo &shape_data_info = data_vec.at(i * kDouble + 1);
    uint64_t shape_data_size = shape_data_info.data_size;
    GELOGD("GenerateGeTensor: shape_data_size %lu", shape_data_size);
    if (shape_data_size == 0) {
      GELOGW("node[%s] outshape is scalar, skip copy shape", node_desc->GetName().c_str());
      output_ptr->MutableTensorDesc().SetShape(GeShape());
      outputs.emplace_back(output_ptr);
      continue;
    }
    uint64_t dim_num = shape_data_size / sizeof(uint64_t);
    std::unique_ptr<int64_t[]> shape_addr(new (std::nothrow) int64_t[dim_num]());
    if (shape_addr == nullptr) {
      REPORT_CALL_ERROR("E19999", "New Buffer failed, size:%lu", dim_num);
      GELOGE(MEMALLOC_FAILED, "[New][Buffer] failed, size:%lu", dim_num);
      return INTERNAL_ERROR;
    }
    GE_CHK_RT_RET(rtMemcpy(shape_addr.get(), shape_data_size,
                           reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(shape_data_info.dst_ptr)),
                           shape_data_size, RT_MEMCPY_DEVICE_TO_HOST));
    std::vector<int64_t> shape_dims;
    for (size_t j = 0; j < dim_num; j++) {
      shape_dims.push_back(shape_addr[j]);
      GELOGD("GenerateGeTensor: dim %ld", shape_addr[j]);
    }
    output_ptr->MutableTensorDesc().SetShape(GeShape(shape_dims));
    outputs.emplace_back(output_ptr);
  }
  return SUCCESS;
}

void AicpuConstantFoldingPass::ReleaseMemory(const vector<AddrAndType> &input_addrs,
                                             const vector<uint64_t> &output_addrs,
                                             const vector<DataPtrInfo> &data_vec) {
  for (const auto &item : input_addrs) {
    GE_CHK_RT(rtFree(reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(item.input_addr))));
  }
  for (auto item : output_addrs) {
    GE_CHK_RT(rtFree(reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(item))));
  }
  for (const auto &item : data_vec) {
    auto dst_ptr = reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(item.dst_ptr));
    if (dst_ptr != nullptr) {
      GE_CHK_RT(rtFree(dst_ptr));
    }
  }
}

bool AicpuConstantFoldingPass::IsSkipFold(const ge::NodePtr &node) {
  GE_CHECK_NOTNULL(node);
  string type = node->GetType();
  if (type == ge::FRAMEWORKOP) {
    if (!ge::AttrUtils::GetStr(node->GetOpDesc(), ge::ATTR_NAME_FRAMEWORK_ORIGINAL_TYPE, type)) {
      GELOGW("Skip aicpu constant folding on frameworkop node [%s]", node->GetName().c_str());
      return true;
    }
  }
  auto instance_ptr = ge::GELib::GetInstance();
  if (instance_ptr == nullptr || !instance_ptr->InitFlag()) {
    REPORT_INNER_ERROR("E19999", "GeLib is not init before, check invalid");
    GELOGE(GE_CLI_GE_NOT_INITIALIZED, "[Check][Param] GE is not initialized");
    return true;
  }
  OpsKernelInfoStorePtr kernel_info = instance_ptr->OpsKernelManagerObj().GetOpsKernelInfoStore(kKernelLibName);
  if (kernel_info == nullptr) {
    REPORT_INNER_ERROR("E19999", "Find ops kernel by name:%s failed", kKernelLibName);
    GELOGE(FAILED, "[Get][OpsKernelInfoStore] by name:%s failed", kKernelLibName);
    return true;
  }
  std::string check_result;
  kernel_info->opsFlagCheck(*node, check_result);
  if (check_result.empty()) {
    REPORT_CALL_ERROR("E19999", "Call opsFlagCheck failed, ops kernel name:%s, op:%s(%s)",
                      kKernelLibName, node->GetName().c_str(), node->GetType().c_str());
    GELOGE(FAILED, "[Call][OpsFlagCheck] failed, ops kernel name:%s, op:%s(%s)",
           kKernelLibName, node->GetName().c_str(), node->GetType().c_str());
    return true;
  }
  return check_result.substr(0, kOpsFlag) == kNotSupported;
}
}  // namespace ge
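
For context, AicpuConstantFoldingPass is a per-node pass: GE's pass runner calls Run(node) on each node of a compute graph, and the pass only folds a node when all of its inputs are constant and the aicpu_tf_kernel ops kernel reports the op as supported. The snippet below is a minimal sketch of how such a node pass is typically registered and driven, assuming the NamesToPass/GEPass helpers declared in graph/passes/base_pass.h; the wrapper function name is illustrative and not taken from this file.

#include "graph/passes/base_pass.h"
#include "graph/passes/aicpu_constant_folding_pass.h"

namespace ge {
// Illustrative sketch: run the AICPU constant folding pass over every node of
// a compute graph via GE's per-node pass runner.
Status RunAicpuConstantFolding(ComputeGraphPtr &compute_graph) {
  AicpuConstantFoldingPass aicpu_constant_folding_pass;
  NamesToPass names_to_passes;
  names_to_passes.emplace_back("AicpuConstantFoldingPass", &aicpu_constant_folding_pass);
  GEPass ge_passes(compute_graph);
  return ge_passes.Run(names_to_passes);  // invokes AicpuConstantFoldingPass::Run for each node
}
}  // namespace ge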

The Graph Engine (GE) module is a submodule of MindSpore, implemented in C++. It sits between the front-end module ME and the underlying hardware and serves as the bridge between them. GE takes the graph delivered by ME as input, applies a series of deep graph optimizations, and finally outputs a graph that can run efficiently on the underlying hardware. GE performs optimizations tailored to the hardware architecture of the Ascend AI processor in order to make full use of its compute power. During model training and inference, GE is invoked automatically and is transparent to the user. GE consists mainly of two parts: GE API and GE Core.
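
As a rough illustration of the GE API side, the following is a minimal sketch assuming the entry points declared in ge/ge_api.h (GEInitialize, Session::AddGraph, Session::RunGraph, GEFinalize); the option maps are left empty and the ge::Graph is assumed to be built or parsed elsewhere.

#include <map>
#include <string>
#include <vector>
#include "ge/ge_api.h"

// Illustrative sketch: initialize GE, hand one graph to a session, run it once,
// and shut GE down again. Real callers would populate options and handle errors
// in more detail.
ge::Status RunGraphOnce(const ge::Graph &graph, const std::vector<ge::Tensor> &inputs,
                        std::vector<ge::Tensor> &outputs) {
  std::map<std::string, std::string> options;  // global and session options, empty here
  if (ge::GEInitialize(options) != ge::SUCCESS) {
    return ge::FAILED;
  }
  {
    ge::Session session(options);
    const uint32_t graph_id = 0U;
    if ((session.AddGraph(graph_id, graph) != ge::SUCCESS) ||
        (session.RunGraph(graph_id, inputs, outputs) != ge::SUCCESS)) {
      (void)ge::GEFinalize();
      return ge::FAILED;
    }
  }  // destroy the session before finalizing GE
  return ge::GEFinalize();
}

Graph optimization passes such as aicpu_constant_folding_pass.cc above run inside GE Core when the graph is compiled, before it is executed on the device.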