
op_task.cc 36 kB

/**
 * Copyright 2019-2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "single_op/task/op_task.h"

#include <google/protobuf/extension_set.h>

#include <chrono>
#include <thread>

#include "aicpu/common/aicpu_task_struct.h"
#include "common/dump/dump_manager.h"
#include "common/dump/dump_op.h"
#include "common/formats/formats.h"
#include "common/math/math_util.h"
#include "framework/common/debug/log.h"
#include "register/op_tiling.h"
#include "runtime/rt.h"
#include "build_task_utils.h"

namespace ge {
namespace {
constexpr int kLaunchRetryTimes = 1000;
constexpr int kSleepTime = 10;
constexpr uint64_t kReleaseFlag = 1;
constexpr int kCopyNum = 2;

void FreeHbm(void *var) {
  if (var != nullptr) {
    (void)rtFree(var);
  }
}
}  // namespace
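
// Registers dump info for a single op when dumping is enabled: collects the
// input/output addresses from the task's arg table and launches the dump op
// on the given stream.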
Status OpTask::OpenDump(rtStream_t stream) {
  if (DumpManager::GetInstance().GetDumpProperties().IsSingleOpNeedDump()) {
    GELOGI("Dump is open in single op, start to set dump info");
    std::vector<uint64_t> input_addrs;
    std::vector<uint64_t> output_addrs;
    auto input_size = op_desc_->GetInputsSize();
    auto output_size = op_desc_->GetOutputsSize();
    uintptr_t *arg_base = nullptr;
    size_t arg_num = 0;
    GetIoAddr(arg_base, arg_num);
    if (arg_num < input_size + output_size) {
      GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "Arg number %zu is less than the total input and output size %zu",
             arg_num, input_size + output_size);
      return ACL_ERROR_GE_INTERNAL_ERROR;
    }
    for (size_t i = 0; i < input_size; i++) {
      uint64_t input_addr = arg_base[i];
      input_addrs.emplace_back(input_addr);
    }
    for (size_t j = 0; j < output_size; j++) {
      uint64_t output_addr = arg_base[input_size + j];
      output_addrs.emplace_back(output_addr);
    }
    dump_op_.SetDumpInfo(DumpManager::GetInstance().GetDumpProperties(), op_desc_, input_addrs, output_addrs, stream);
    auto status = dump_op_.LaunchDumpOp();
    if (status != SUCCESS) {
      GELOGE(status, "Launch dump op failed in single op");
      return status;
    }
    return SUCCESS;
  }
  GELOGI("Dump is not open in single op");
  return SUCCESS;
}
void TbeOpTask::SetStubFunc(const std::string &name, const void *stub_func) {
  this->stub_name_ = name;
  this->stub_func_ = stub_func;
}

void TbeOpTask::SetKernelArgs(std::unique_ptr<uint8_t[]> &&args, size_t arg_size, uint32_t block_dim,
                              const OpDescPtr &op_desc) {
  args_ = std::move(args);
  arg_size_ = arg_size;
  block_dim_ = block_dim;
  op_desc_ = op_desc;
}

void TbeOpTask::SetSmDesc(void *sm_desc) { sm_desc_ = sm_desc; }

void OpTask::SetModelArgs(std::string model_name, uint32_t model_id) {
  model_name_ = model_name;
  model_id_ = model_id;
}

Status OpTask::GetProfilingArgs(std::string &model_name, std::string &op_name, uint32_t &model_id,
                                uint32_t &block_dim) {
  model_name = model_name_;
  model_id = model_id_;
  block_dim = block_dim_;
  GE_CHECK_NOTNULL(op_desc_);
  op_name = op_desc_->GetName();
  return SUCCESS;
}

Status OpTask::UpdateRunInfo(const vector<GeTensorDesc> &input_desc, const vector<GeTensorDesc> &output_desc) {
  return UNSUPPORTED;
}
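
// Shared arg-table update logic: resolves the task's input/output (and
// optionally workspace) addresses from the model param and writes them into
// the task's arg table in order.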
Status OpTask::DoUpdateArgTable(const SingleOpModelParam &param, bool keep_workspace) {
  auto addresses = BuildTaskUtils::GetAddresses(op_desc_, param, keep_workspace);
  auto all_addresses = BuildTaskUtils::JoinAddresses(addresses);
  uintptr_t *arg_base = nullptr;
  size_t arg_num = 0;
  GetIoAddr(arg_base, arg_num);
  if (arg_num < all_addresses.size()) {
    GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[%s] Arg count mismatch, expect at least %zu, but got %zu",
           op_desc_->GetName().c_str(), all_addresses.size(), arg_num);
    return ACL_ERROR_GE_INTERNAL_ERROR;
  }
  for (void *addr : all_addresses) {
    *arg_base++ = reinterpret_cast<uintptr_t>(addr);
  }
  return SUCCESS;
}

Status OpTask::UpdateArgTable(const SingleOpModelParam &param) {
  return DoUpdateArgTable(param, true);
}

Status OpTask::LaunchKernel(const vector<GeTensorDesc> &input_desc,
                            const vector<DataBuffer> &input_buffers,
                            vector<GeTensorDesc> &output_desc,
                            vector<DataBuffer> &output_buffers,
                            rtStream_t stream) {
  return UNSUPPORTED;
}

uint32_t OpTask::GetTaskType() const { return kTaskTypeInvalid; }

TbeOpTask::~TbeOpTask() {
  if (sm_desc_ != nullptr) {
    (void)rtMemFreeManaged(sm_desc_);
  }
  if (tiling_buffer_ != nullptr) {
    (void)rtFree(tiling_buffer_);
  }
}

const void *TbeOpTask::GetArgs() const { return args_.get(); }
size_t TbeOpTask::GetArgSize() const { return arg_size_; }
const std::string &TbeOpTask::GetStubName() const { return stub_name_; }
uint32_t TbeOpTask::GetTaskType() const { return kTaskTypeAicore; }
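
// Static-shape launch path: invokes rtKernelLaunch with the prebuilt arg
// table, retrying up to kLaunchRetryTimes times with a kSleepTime ms sleep
// between attempts when the launch fails.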
Status TbeOpTask::LaunchKernel(rtStream_t stream) {
  GELOGD("To invoke rtKernelLaunch. task = %s, block_dim = %u", this->stub_name_.c_str(), block_dim_);
  auto *sm_desc = reinterpret_cast<rtSmDesc_t *>(sm_desc_);
  auto ret = rtKernelLaunch(stub_func_, block_dim_, args_.get(), static_cast<uint32_t>(arg_size_), sm_desc, stream);
  int retry_times = 0;
  while (ret != RT_ERROR_NONE && retry_times < kLaunchRetryTimes) {
    retry_times++;
    GELOGW("Retry after %d ms, retry_times: %d", kSleepTime, retry_times);
    std::this_thread::sleep_for(std::chrono::milliseconds(kSleepTime));
    ret = rtKernelLaunch(stub_func_, block_dim_, args_.get(), static_cast<uint32_t>(arg_size_), sm_desc, stream);
  }
  if (ret != RT_ERROR_NONE) {
    GELOGE(ret, "Invoke rtKernelLaunch failed. ret = %d, task = %s", ret, this->stub_name_.c_str());
    return RT_ERROR_TO_GE_STATUS(ret);
  }
  GELOGI("[TASK_INFO] %s", this->stub_name_.c_str());
  auto status = OpenDump(stream);
  if (status != SUCCESS) {
    GELOGE(status, "Open dump failed in the tbe single op %s", this->stub_name_.c_str());
    return status;
  }
  return SUCCESS;
}
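
// Dynamic-shape path: refreshes the node's tensor descs with the runtime
// shapes, then runs op tiling (OpParaCalculate) to obtain the block_dim,
// tiling data and workspace sizes for this execution.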
Status TbeOpTask::UpdateRunInfo(const vector<GeTensorDesc> &input_desc, const vector<GeTensorDesc> &output_desc) {
  GE_CHK_STATUS_RET_NOLOG(UpdateNodeByShape(input_desc, output_desc));
  // invoke OpParaCalculate
  GELOGD("Start to invoke OpParaCalculate.");
  optiling::OpRunInfo run_info;
  run_info.block_dim = 0;
  auto ret = optiling::OpParaCalculate(*node_, run_info);
  if (ret != GRAPH_SUCCESS) {
    GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "Failed to invoke OpParaCalculate. ret = %u", ret);
    return ACL_ERROR_GE_INTERNAL_ERROR;
  }
  block_dim_ = run_info.block_dim;
  tiling_data_ = run_info.tiling_data.str();
  GELOGD("Done invoking OpParaCalculate successfully. block_dim = %u, tiling size = %zu", block_dim_,
         tiling_data_.size());
  GE_CHK_STATUS_RET(AllocateWorkspaces(run_info.workspaces), "Failed to allocate workspaces");
  return SUCCESS;
}
Status TbeOpTask::UpdateTensorDesc(const GeTensorDesc &src_tensor, GeTensorDesc &dst_tensor) {
  int64_t storage_format_val = static_cast<Format>(FORMAT_RESERVED);
  (void)AttrUtils::GetInt(src_tensor, ge::ATTR_NAME_STORAGE_FORMAT, storage_format_val);
  auto storage_format = static_cast<Format>(storage_format_val);
  if (storage_format == FORMAT_RESERVED) {
    GELOGD("Storage format not set. update shape to [%s], and original shape to [%s]",
           src_tensor.GetShape().ToString().c_str(), src_tensor.GetOriginShape().ToString().c_str());
    dst_tensor.SetShape(src_tensor.GetShape());
    dst_tensor.SetOriginShape(src_tensor.GetOriginShape());
  } else {
    std::vector<int64_t> storage_shape;
    if (!AttrUtils::GetListInt(src_tensor, ge::ATTR_NAME_STORAGE_SHAPE, storage_shape)) {
      GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "Failed to get storage_shape while storage_format was set");
      return ACL_ERROR_GE_INTERNAL_ERROR;
    }
    GELOGD("Storage format set. update shape to [%s], and original shape to [%s]",
           GeShape(storage_shape).ToString().c_str(), src_tensor.GetShape().ToString().c_str());
    dst_tensor.SetShape(GeShape(std::move(storage_shape)));
    dst_tensor.SetOriginShape(src_tensor.GetShape());
  }
  return SUCCESS;
}

Status TbeOpTask::UpdateNodeByShape(const vector<GeTensorDesc> &input_desc, const vector<GeTensorDesc> &output_desc) {
  auto op_desc = node_->GetOpDesc();
  GE_CHECK_NOTNULL(op_desc);
  // Set runtime shape to node
  for (size_t i = 0; i < input_desc.size(); ++i) {
    auto tensor_desc = op_desc->MutableInputDesc(i);
    auto &runtime_tensor_desc = input_desc[i];
    GE_CHECK_NOTNULL(tensor_desc);
    GE_CHK_STATUS_RET(UpdateTensorDesc(runtime_tensor_desc, *tensor_desc));
  }
  for (size_t i = 0; i < output_desc.size(); ++i) {
    auto tensor_desc = op_desc->MutableOutputDesc(i);
    auto &runtime_tensor_desc = output_desc[i];
    GE_CHECK_NOTNULL(tensor_desc);
    GE_CHK_STATUS_RET(UpdateTensorDesc(runtime_tensor_desc, *tensor_desc));
  }
  return SUCCESS;
}

void TbeOpTask::EnableDynamicSupport(const NodePtr &node, void *tiling_buffer, size_t max_tiling_size) {
  node_ = node;
  tiling_buffer_ = tiling_buffer;
  max_tiling_size_ = max_tiling_size;
}

Status TbeOpTask::AllocateWorkspaces(const vector<int64_t> &workspace_sizes) {
  static const std::string kPurpose("malloc workspace memory for dynamic op.");
  if (workspace_sizes.empty()) {
    GELOGD("No need to allocate workspace.");
    return SUCCESS;
  }
  int64_t total_size = 0;
  std::vector<int64_t> ws_offsets;
  for (auto ws_size : workspace_sizes) {
    // alignment and padding should be done in OpParaCalculate
    if (CheckInt64AddOverflow(total_size, ws_size) != SUCCESS) {
      return ACL_ERROR_GE_INTERNAL_ERROR;
    }
    ws_offsets.emplace_back(total_size);
    total_size += ws_size;
  }
  GELOGD("Total workspace size is %ld", total_size);
  GE_CHECK_NOTNULL(stream_resource_);
  auto ws_base = stream_resource_->MallocMemory(kPurpose, static_cast<size_t>(total_size));
  if (ws_base == nullptr) {
    GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Failed to allocate memory of size: %ld", total_size);
    return ACL_ERROR_GE_MEMORY_ALLOCATION;
  }
  GELOGD("Done allocating workspace memory successfully.");
  for (auto ws_offset : ws_offsets) {
    workspaces_.emplace_back(ws_base + ws_offset);
  }
  return SUCCESS;
}
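
// Dynamic-shape launch path: the arg table is laid out as
// [input addrs | output addrs | workspace addrs | tiling buffer addr],
// with the tiling data copied to the device asynchronously before launch.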
Status TbeOpTask::LaunchKernel(const vector<GeTensorDesc> &input_desc,
                               const vector<DataBuffer> &input_buffers,
                               vector<GeTensorDesc> &output_desc,
                               vector<DataBuffer> &output_buffers,
                               rtStream_t stream) {
  GE_CHK_STATUS_RET_NOLOG(UpdateRunInfo(input_desc, output_desc));
  GELOGD("[%s] Start to launch kernel", node_->GetName().c_str());
  std::vector<void *> args;
  for (auto &buffer : input_buffers) {
    args.emplace_back(buffer.data);
  }
  for (auto &buffer : output_buffers) {
    args.emplace_back(buffer.data);
  }
  for (auto &buffer : workspaces_) {
    args.emplace_back(buffer);
  }
  if (tiling_buffer_ != nullptr) {
    GELOGD("[%s] Start to copy tiling info. size = %zu", node_->GetName().c_str(), tiling_data_.size());
    GE_CHK_RT_RET(rtMemcpyAsync(tiling_buffer_, max_tiling_size_, tiling_data_.data(), tiling_data_.size(),
                                RT_MEMCPY_HOST_TO_DEVICE_EX, stream));
    args.emplace_back(tiling_buffer_);
  }
  if (memcpy_s(args_.get(), arg_size_, args.data(), args.size() * sizeof(void *)) != EOK) {
    GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "[%s] Failed to update kernel args.",
           node_->GetName().c_str());
    return ACL_ERROR_GE_MEMORY_OPERATE_FAILED;
  }
  GELOGD("[%s] Start to invoke rtKernelLaunch", node_->GetName().c_str());
  GE_CHK_RT_RET(rtKernelLaunch(stub_func_, block_dim_, args_.get(), arg_size_, nullptr, stream));
  GELOGD("[%s] Done invoking rtKernelLaunch successfully", node_->GetName().c_str());
  return SUCCESS;
}

void TbeOpTask::GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) {
  arg_base = reinterpret_cast<uintptr_t *>(args_.get());
  arg_count = arg_size_ / sizeof(void *);
  if (tiling_buffer_ != nullptr) {
    --arg_count;
  }
}

AiCpuBaseTask::~AiCpuBaseTask() {
  if (ext_info_addr_dev_ != nullptr) {
    (void)rtFree(ext_info_addr_dev_);
  }
}
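
// Parses the aicpu kernel's extended info (shape/type metadata), stamps it
// with the session/kernel id and execution mode, then copies it to device
// memory for the aicpu kernel to consume.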
Status AiCpuBaseTask::SetExtInfoAndType(const std::string &kernel_ext_info, uint64_t kernel_id) {
  if (kernel_ext_info.empty()) {
    GELOGI("Kernel_ext_info is empty, no need to copy to device.");
    return SUCCESS;
  }
  int32_t unknown_shape_type_val = 0;
  (void)AttrUtils::GetInt(op_desc_, ::ge::ATTR_NAME_UNKNOWN_SHAPE_TYPE, unknown_shape_type_val);
  GELOGD("Get unknown_type is %d.", unknown_shape_type_val);
  unknown_type_ = static_cast<UnknowShapeOpType>(unknown_shape_type_val);
  aicpu_ext_handle_.reset(new (std::nothrow) ::ge::hybrid::AicpuExtInfoHandler(op_desc_->GetName(),
                                                                               num_inputs_,
                                                                               num_outputs_,
                                                                               unknown_type_));
  GE_CHK_BOOL_RET_STATUS(aicpu_ext_handle_ != nullptr, ACL_ERROR_GE_MEMORY_ALLOCATION,
                         "Malloc aicpu_ext_handle mem failed!");
  Status ret = aicpu_ext_handle_->Parse(kernel_ext_info);
  if (ret != SUCCESS) {
    GELOGE(ret, "Parse kernel ext info failed, kernel_ext_info_size=%zu.", kernel_ext_info.size());
    return ret;
  }
  GE_CHK_STATUS_RET(aicpu_ext_handle_->UpdateSessionInfo(ULLONG_MAX, kernel_id, false),
                    "UpdateSessionInfo failed.");
  GE_CHK_STATUS_RET(aicpu_ext_handle_->UpdateExecuteMode(true), "UpdateExecuteMode failed.");
  GE_CHK_RT_RET(rtMalloc(&ext_info_addr_dev_, aicpu_ext_handle_->GetExtInfoLen(), RT_MEMORY_HBM));
  GE_CHK_RT_RET(rtMemcpy(ext_info_addr_dev_, aicpu_ext_handle_->GetExtInfoLen(),
                         aicpu_ext_handle_->GetExtInfo(), aicpu_ext_handle_->GetExtInfoLen(),
                         RT_MEMCPY_HOST_TO_DEVICE));
  return SUCCESS;
}

Status AiCpuBaseTask::SetInputConst() {
  input_is_const_.clear();
  const vector<bool> v_is_input_const = op_desc_->GetIsInputConst();
  for (size_t i = 0; i < op_desc_->GetAllInputsSize(); ++i) {
    const GeTensorDescPtr tensor_desc = op_desc_->MutableInputDesc(static_cast<uint32_t>(i));
    if (tensor_desc == nullptr) {
      GELOGD("SingleOp: %s, Index: %zu, has no input", op_desc_->GetName().c_str(), i);
      continue;
    }
    if (i < v_is_input_const.size() && v_is_input_const[i]) {
      GELOGD("SingleOp: %s, Index: %zu, input is const", op_desc_->GetName().c_str(), i);
      input_is_const_.push_back(true);
      continue;
    }
    input_is_const_.push_back(false);
  }
  return SUCCESS;
}
Status AiCpuBaseTask::UpdateExtInfo(const std::vector<GeTensorDesc> &input_desc,
                                    std::vector<GeTensorDesc> &output_desc,
                                    rtStream_t stream) {
  GELOGI("Update ext info begin, unknown_type=%d.", unknown_type_);
  GE_CHECK_NOTNULL(aicpu_ext_handle_);
  GE_CHK_STATUS_RET(aicpu_ext_handle_->UpdateExecuteMode(false), "UpdateExecuteMode failed.");
  if (num_inputs_ == 0 && num_outputs_ == 0) {
    GELOGI("No input and output, no need to update ext info.");
    return SUCCESS;
  }
  size_t non_const_index = 0;
  for (size_t input_index = 0; input_index < num_inputs_; input_index++) {
    if (input_index < input_is_const_.size() && input_is_const_[input_index]) {
      // for const inputs, get input_desc from op_desc_; num_inputs_ is the op_desc_ input size
      auto const_input_desc = op_desc_->MutableInputDesc(static_cast<uint32_t>(input_index));
      GE_CHECK_NOTNULL(const_input_desc);
      GE_CHK_STATUS_RET(aicpu_ext_handle_->UpdateInputShapeAndType(input_index, *const_input_desc),
                        "Input[%zu] update input shape failed.", input_index);
      continue;
    }
    GE_CHK_BOOL_RET_STATUS(non_const_index < input_desc.size(), ACL_ERROR_GE_PARAM_INVALID,
                           "Input_desc size is %zu, but non_const_index is %zu",
                           input_desc.size(), non_const_index);
    GE_CHK_STATUS_RET(aicpu_ext_handle_->UpdateInputShapeAndType(input_index, input_desc[non_const_index]),
                      "Input[%zu] update input shape failed.", input_index);
    non_const_index++;
  }
  if (unknown_type_ != DEPEND_COMPUTE) {
    for (size_t j = 0; j < num_outputs_; ++j) {
      GE_CHK_STATUS_RET(aicpu_ext_handle_->UpdateOutputShapeAndType(j, output_desc[j]),
                        "Output[%zu] UpdateOutputShapeAndType failed.", j);
    }
  }
  GE_CHK_RT_RET(rtMemcpyAsync(ext_info_addr_dev_,
                              aicpu_ext_handle_->GetExtInfoLen(),  // check size
                              aicpu_ext_handle_->GetExtInfo(),
                              aicpu_ext_handle_->GetExtInfoLen(),
                              RT_MEMCPY_HOST_TO_DEVICE_EX,
                              stream));
  GELOGI("Update ext info end.");
  return SUCCESS;
}

Status AiCpuBaseTask::UpdateOutputShape(vector<GeTensorDesc> &output_desc) {
  if (num_outputs_ == 0) {
    GELOGD("AiCpuBaseTask output_num is 0, no need to update output shape.");
    return SUCCESS;
  }
  GELOGD("Start to update DEPEND_SHAPE_RANGE AiCpuBaseTask output shape.");
  GE_CHK_RT_RET(rtMemcpy(aicpu_ext_handle_->GetExtInfo(),
                         aicpu_ext_handle_->GetExtInfoLen(),
                         ext_info_addr_dev_,
                         aicpu_ext_handle_->GetExtInfoLen(),
                         RT_MEMCPY_DEVICE_TO_HOST));
  for (size_t i = 0; i < num_outputs_; ++i) {
    GeShape shape;
    DataType data_type;
    aicpu_ext_handle_->GetOutputShapeAndType(i, shape, data_type);
    GE_CHK_STATUS_RET(UpdateShapeToOutputDesc(shape, output_desc[i]),
                      "AiCpuCCTask Update [%zu]th output shape failed.", i);
  }
  GELOGD("Update DEPEND_SHAPE_RANGE AiCpuBaseTask output shape finished.");
  return SUCCESS;
}
Status AiCpuBaseTask::UpdateShapeToOutputDesc(const GeShape &shape_new, GeTensorDesc &output_desc) {
  auto shape_old = output_desc.GetShape();
  output_desc.SetShape(shape_new);
  GELOGD("Update AiCpuBaseTask shape from %s to %s", shape_old.ToString().c_str(), shape_new.ToString().c_str());
  auto origin_shape_old = output_desc.GetOriginShape();
  auto origin_format = output_desc.GetOriginFormat();
  auto format = output_desc.GetFormat();
  if (origin_format == format) {
    output_desc.SetOriginShape(shape_new);
    return SUCCESS;
  }
  std::vector<int64_t> origin_dims_new;
  auto trans_ret = formats::TransShape(format, shape_new.GetDims(),
                                       output_desc.GetDataType(), origin_format, origin_dims_new);
  GE_CHK_STATUS_RET(trans_ret,
                    "AiCpuTask originFormat[%d] is not same as format[%d], but TransShape failed, shape=%s.",
                    origin_format, format, shape_new.ToString().c_str());
  auto origin_shape_new = GeShape(origin_dims_new);
  output_desc.SetOriginShape(origin_shape_new);
  GELOGD("AiCpuTask originFormat[%d] is not same as format[%d], need to update from %s to %s.",
         origin_format, format, origin_shape_old.ToString().c_str(), origin_shape_new.ToString().c_str());
  return SUCCESS;
}
Status AiCpuBaseTask::UpdateIoAddr(const vector<DataBuffer> &inputs, const vector<DataBuffer> &outputs) {
  uintptr_t *arg_base = nullptr;
  size_t arg_num = 0;
  GetIoAddr(arg_base, arg_num);
  // input number and output number were checked in ValidateParams
  size_t non_const_index = 0;
  for (size_t input_index = 0; input_index < num_inputs_; input_index++) {
    if (input_index < input_is_const_.size() && input_is_const_[input_index]) {
      // const inputs do not need an address update
      GE_CHECK_NOTNULL(arg_base);
      GELOGD("AICpuTask input[%zu] addr = %lu", input_index, *arg_base);
      arg_base++;
      continue;
    }
    GE_CHK_BOOL_RET_STATUS(non_const_index < inputs.size(), ACL_ERROR_GE_PARAM_INVALID,
                           "Input size is %zu, but non_const_index is %zu",
                           inputs.size(), non_const_index);
    auto addr = inputs[non_const_index].data;
    GE_CHECK_NOTNULL(addr);
    GELOGD("AICpuTask input[%zu] addr = %p", input_index, addr);
    *arg_base++ = reinterpret_cast<uintptr_t>(addr);
    non_const_index++;
  }
  for (size_t i = 0; i < outputs.size(); ++i) {
    auto addr = outputs[i].data;
    GE_CHECK_NOTNULL(addr);
    GELOGD("AICpuTask output[%zu] addr = %p", i, addr);
    *arg_base++ = reinterpret_cast<uintptr_t>(addr);
  }
  return SUCCESS;
}

AiCpuTask::~AiCpuTask() {
  FreeHbm(args_);
  FreeHbm(io_addr_);
  if (dynamic_flag_) {
    FreeHbm(workspace_addr_);
  }
  FreeHbm(copy_workspace_buf_);
  FreeHbm(copy_ioaddr_dev_);
  FreeHbm(copy_input_release_flag_dev_);
  FreeHbm(copy_input_data_size_dev_);
  FreeHbm(copy_input_src_dev_);
  FreeHbm(copy_input_dst_dev_);
  FreeHbm(copy_task_args_buf_);
  for (auto summary : output_summary_) {
    FreeHbm(summary);
  }
  for (auto out_shape : out_shape_hbm_) {
    FreeHbm(out_shape);
  }
}
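
// Copies the host-side I/O address table to device memory, then launches the
// aicpu kernel via rtKernelLaunchEx and registers dump info if needed.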
Status AiCpuTask::LaunchKernel(rtStream_t stream) {
  GELOGD("Start to launch kernel. task = %s", this->op_type_.c_str());
  auto ret = rtMemcpyAsync(io_addr_,
                           io_addr_size_,
                           io_addr_host_.data(),
                           io_addr_host_.size() * sizeof(void *),
                           RT_MEMCPY_HOST_TO_DEVICE_EX,
                           stream);
  if (ret != RT_ERROR_NONE) {
    GELOGE(ret, "rtMemcpyAsync io addr data failed. ret = %d, task = %s", ret, this->op_type_.c_str());
    return RT_ERROR_TO_GE_STATUS(ret);
  }
  GELOGI("To invoke rtKernelLaunchEx. task = %s", this->op_type_.c_str());
  ret = rtKernelLaunchEx(args_, arg_size_, 0, stream);
  if (ret != RT_ERROR_NONE) {
    GELOGE(ret, "Invoke rtKernelLaunchEx failed. ret = %d, task = %s", ret, this->op_type_.c_str());
    return RT_ERROR_TO_GE_STATUS(ret);
  }
  GELOGI("[TASK_INFO] %lu/%s", kernel_id_, op_type_.c_str());
  auto status = OpenDump(stream);
  if (status != SUCCESS) {
    GELOGE(status, "Open dump failed in aicpu single op %s", this->op_type_.c_str());
    return status;
  }
  GELOGD("Done launching kernel successfully. task = %s", this->op_type_.c_str());
  return SUCCESS;
}
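
// Builds the input tables for the device-side memcpy task used by
// DEPEND_COMPUTE ops: each output contributes two copy entries, one for the
// raw result data and one for the result shape.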
Status AiCpuTask::PrepareCopyInputs(vector<DataBuffer> &outputs) {
  std::vector<uint64_t> copy_input_release_flag;
  std::vector<uint64_t> copy_input_data_size;
  std::vector<uint64_t> copy_input_src;
  std::vector<uint64_t> copy_input_dst;
  for (size_t i = 0; i < num_outputs_; ++i) {
    const auto &summary = output_summary_host_[i];
    GELOGI("Node out[%zu] summary, shape data=0x%lx, shape data size=%lu, raw data=0x%lx, raw data size=%lu.",
           i, summary.shape_data_ptr, summary.shape_data_size,
           summary.raw_data_ptr, summary.raw_data_size);
    auto output = outputs[i];
    copy_input_release_flag.emplace_back(kReleaseFlag);
    if (summary.raw_data_size > 0) {
      copy_input_data_size.emplace_back(output.length);
    } else {
      copy_input_data_size.emplace_back(summary.raw_data_size);
    }
    copy_input_src.emplace_back(summary.raw_data_ptr);
    copy_input_dst.emplace_back(reinterpret_cast<uintptr_t>(output.data));
    const auto &shape_buffer = out_shape_hbm_[i];
    copy_input_release_flag.emplace_back(kReleaseFlag);
    copy_input_data_size.emplace_back(summary.shape_data_size);
    copy_input_src.emplace_back(summary.shape_data_ptr);
    copy_input_dst.emplace_back(reinterpret_cast<uintptr_t>(shape_buffer));
  }
  const size_t copy_input_buf_len = num_outputs_ * kCopyNum * sizeof(uint64_t);
  GE_CHK_RT_RET(rtMemcpy(copy_input_release_flag_dev_, copy_input_buf_len,
                         copy_input_release_flag.data(), copy_input_buf_len, RT_MEMCPY_HOST_TO_DEVICE));
  GE_CHK_RT_RET(rtMemcpy(copy_input_data_size_dev_, copy_input_buf_len,
                         copy_input_data_size.data(), copy_input_buf_len, RT_MEMCPY_HOST_TO_DEVICE));
  GE_CHK_RT_RET(rtMemcpy(copy_input_src_dev_, copy_input_buf_len,
                         copy_input_src.data(), copy_input_buf_len, RT_MEMCPY_HOST_TO_DEVICE));
  GE_CHK_RT_RET(rtMemcpy(copy_input_dst_dev_, copy_input_buf_len,
                         copy_input_dst.data(), copy_input_buf_len, RT_MEMCPY_HOST_TO_DEVICE));
  return SUCCESS;
}
Status AiCpuTask::ReadResultSummaryAndPrepareMemory() {
  for (size_t i = 0; i < num_outputs_; ++i) {
    auto &result_summary = output_summary_host_[i];
    GE_CHK_RT_RET(rtMemcpy(&result_summary, sizeof(aicpu::FWKAdapter::ResultSummary),
                           output_summary_[i], sizeof(aicpu::FWKAdapter::ResultSummary),
                           RT_MEMCPY_DEVICE_TO_HOST));
    auto shape_data_size = result_summary.shape_data_size;
    void *shape_buffer = nullptr;
    if (shape_data_size > 0) {
      GE_CHK_RT_RET(rtMalloc(&shape_buffer, shape_data_size, RT_MEMORY_HBM));
    }
    out_shape_hbm_.emplace_back(shape_buffer);
  }
  return SUCCESS;
}

Status AiCpuTask::CopyDataToHbm(vector<DataBuffer> &outputs,
                                rtStream_t stream) {
  GE_CHK_STATUS_RET_NOLOG(PrepareCopyInputs(outputs));
  GE_CHK_RT_RET(rtKernelLaunchEx(copy_task_args_buf_, sizeof(STR_FWK_OP_KERNEL),
                                 RT_KERNEL_DEFAULT, stream));
  GE_CHK_RT_RET(rtStreamSynchronize(stream));
  return SUCCESS;
}

Status AiCpuTask::UpdateShapeByHbmBuffer(vector<GeTensorDesc> &output_desc) {
  for (size_t i = 0; i < num_outputs_; ++i) {
    const auto &result_summary = output_summary_host_[i];
    std::vector<int64_t> shape_dims;
    if (result_summary.shape_data_size > 0) {
      const auto &shape_hbm = out_shape_hbm_[i];
      uint32_t dim_num = result_summary.shape_data_size / sizeof(int64_t);
      std::unique_ptr<int64_t[]> shape_addr(new (std::nothrow) int64_t[dim_num]());
      GE_CHECK_NOTNULL(shape_addr);
      GE_CHK_RT_RET(rtMemcpy(shape_addr.get(), result_summary.shape_data_size,
                             shape_hbm, result_summary.shape_data_size, RT_MEMCPY_DEVICE_TO_HOST));
      for (uint32_t dim_idx = 0; dim_idx < dim_num; ++dim_idx) {
        shape_dims.emplace_back(shape_addr[dim_idx]);
        GELOGD("Node [%zu]th output dim[%u]=%ld.", i, dim_idx, shape_addr[dim_idx]);
      }
    }
    GE_CHK_STATUS_RET(UpdateShapeToOutputDesc(GeShape(shape_dims), output_desc[i]),
                      "AiCpuTask update [%zu]th output shape failed.", i);
  }
  return SUCCESS;
}

Status AiCpuTask::UpdateShapeAndDataByResultSummary(vector<GeTensorDesc> &output_desc,
                                                    vector<DataBuffer> &outputs,
                                                    rtStream_t stream) {
  if (num_outputs_ == 0) {
    GELOGI("Output num is 0, there is no need to update the output and size.");
    return SUCCESS;
  }
  GELOGI("Update shape and data by result summary begin.");
  for (auto out_shape : out_shape_hbm_) {
    FreeHbm(out_shape);
  }
  out_shape_hbm_.clear();
  GE_CHK_STATUS_RET(ReadResultSummaryAndPrepareMemory(),
                    "Read ResultSummary and update output shape failed.");
  GE_CHK_STATUS_RET(CopyDataToHbm(outputs, stream),
                    "Copy data to output failed.");
  GE_CHK_STATUS_RET(UpdateShapeByHbmBuffer(output_desc),
                    "Update shape by hbm buffer failed.");
  for (auto out_shape : out_shape_hbm_) {
    FreeHbm(out_shape);
  }
  out_shape_hbm_.clear();
  GELOGI("Update shape and data by result summary end.");
  return SUCCESS;
}
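
// Pre-allocates the device buffers needed for the DEPEND_COMPUTE flow: one
// ResultSummary per output, the four copy-input tables, and the arg buffer
// of the device-side memcpy task.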
Status AiCpuTask::InitForSummaryAndCopy() {
  if (unknown_type_ != DEPEND_COMPUTE || num_outputs_ == 0) {
    GELOGI("Unknown_type is %d, output num is %zu.", unknown_type_, num_outputs_);
    return SUCCESS;
  }
  output_summary_.resize(num_outputs_);
  constexpr auto result_summary_size = sizeof(aicpu::FWKAdapter::ResultSummary);
  for (size_t i = 0; i < num_outputs_; ++i) {
    GE_CHK_RT_RET(rtMalloc(&output_summary_[i], result_summary_size, RT_MEMORY_HBM));
  }
  output_summary_host_.resize(num_outputs_);
  const size_t copy_input_buf_len = num_outputs_ * kCopyNum * sizeof(uint64_t);
  GE_CHK_RT_RET(rtMalloc(&copy_input_release_flag_dev_, copy_input_buf_len, RT_MEMORY_HBM));
  GE_CHK_RT_RET(rtMalloc(&copy_input_data_size_dev_, copy_input_buf_len, RT_MEMORY_HBM));
  GE_CHK_RT_RET(rtMalloc(&copy_input_src_dev_, copy_input_buf_len, RT_MEMORY_HBM));
  GE_CHK_RT_RET(rtMalloc(&copy_input_dst_dev_, copy_input_buf_len, RT_MEMORY_HBM));
  GE_CHK_RT_RET(rtMalloc(&copy_task_args_buf_, sizeof(STR_FWK_OP_KERNEL), RT_MEMORY_HBM));
  std::vector<uint64_t> copy_io_addr;
  copy_io_addr.emplace_back(reinterpret_cast<uintptr_t>(copy_input_release_flag_dev_));
  copy_io_addr.emplace_back(reinterpret_cast<uintptr_t>(copy_input_data_size_dev_));
  copy_io_addr.emplace_back(reinterpret_cast<uintptr_t>(copy_input_src_dev_));
  copy_io_addr.emplace_back(reinterpret_cast<uintptr_t>(copy_input_dst_dev_));
  const auto copy_io_addr_size = sizeof(uint64_t) * copy_io_addr.size();
  GE_CHK_RT_RET(rtMalloc(&copy_ioaddr_dev_, copy_io_addr_size, RT_MEMORY_HBM));
  GE_CHK_RT_RET(rtMemcpy(copy_ioaddr_dev_, copy_io_addr_size,
                         copy_io_addr.data(), copy_io_addr_size, RT_MEMCPY_HOST_TO_DEVICE));
  return SUCCESS;
}

Status AiCpuTask::SetMemCopyTask(const domi::KernelExDef &kernel_def) {
  if (kernel_def.args_size() > sizeof(STR_FWK_OP_KERNEL)) {
    GELOGE(ACL_ERROR_GE_PARAM_INVALID, "sizeof STR_FWK_OP_KERNEL is: %lu, but args_size is: %d",
           sizeof(STR_FWK_OP_KERNEL), kernel_def.args_size());
    return ACL_ERROR_GE_PARAM_INVALID;
  }
  GE_CHK_RT_RET(rtMalloc(&copy_workspace_buf_, kernel_def.task_info_size(), RT_MEMORY_HBM));
  GE_CHK_RT_RET(rtMemcpy(copy_workspace_buf_, kernel_def.task_info_size(),
                         kernel_def.task_info().data(), kernel_def.task_info_size(), RT_MEMCPY_HOST_TO_DEVICE));
  STR_FWK_OP_KERNEL aicpu_task = {0};
  auto sec_ret = memcpy_s(&aicpu_task, sizeof(STR_FWK_OP_KERNEL),
                          kernel_def.args().data(), kernel_def.args().size());
  if (sec_ret != EOK) {
    GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "memcpy failed, ret: %d", sec_ret);
    return ACL_ERROR_GE_MEMORY_OPERATE_FAILED;
  }
  aicpu_task.fwkKernelBase.fwk_kernel.inputOutputAddr = reinterpret_cast<uintptr_t>(copy_ioaddr_dev_);
  aicpu_task.fwkKernelBase.fwk_kernel.workspaceBaseAddr = reinterpret_cast<uintptr_t>(copy_workspace_buf_);
  aicpu_task.fwkKernelBase.fwk_kernel.extInfoAddr = 0;
  aicpu_task.fwkKernelBase.fwk_kernel.extInfoLen = 0;
  GE_CHK_RT_RET(rtMemcpy(copy_task_args_buf_, sizeof(STR_FWK_OP_KERNEL),
                         &aicpu_task, sizeof(STR_FWK_OP_KERNEL), RT_MEMCPY_HOST_TO_DEVICE));
  return SUCCESS;
}
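
// Full launch flow for a dynamic aicpu op: update ext info, bind the I/O
// addresses (result summaries stand in for the real outputs of
// DEPEND_COMPUTE ops), launch, then resolve output shapes and data
// according to unknown_type_.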
Status AiCpuTask::LaunchKernel(const std::vector<GeTensorDesc> &input_desc,
                               const std::vector<DataBuffer> &input_buffers,
                               std::vector<GeTensorDesc> &output_desc,
                               std::vector<DataBuffer> &output_buffers,
                               rtStream_t stream) {
  GE_CHK_STATUS_RET_NOLOG(UpdateExtInfo(input_desc, output_desc, stream));
  if (unknown_type_ == DEPEND_COMPUTE) {
    std::vector<DataBuffer> summary_buffers;
    for (size_t i = 0; i < num_outputs_; ++i) {
      summary_buffers.emplace_back(output_summary_[i], sizeof(aicpu::FWKAdapter::ResultSummary), false);
    }
    GE_CHK_STATUS_RET_NOLOG(UpdateIoAddr(input_buffers, summary_buffers));
  } else {
    GE_CHK_STATUS_RET_NOLOG(UpdateIoAddr(input_buffers, output_buffers));
  }
  GE_CHK_STATUS_RET_NOLOG(LaunchKernel(stream));
  if (unknown_type_ == DEPEND_SHAPE_RANGE) {
    GE_CHK_RT_RET(rtStreamSynchronize(stream));
    GE_CHK_STATUS_RET_NOLOG(UpdateOutputShape(output_desc));
  } else if (unknown_type_ == DEPEND_COMPUTE) {
    GE_CHK_RT_RET(rtStreamSynchronize(stream));
    GE_CHK_STATUS_RET_NOLOG(UpdateShapeAndDataByResultSummary(output_desc, output_buffers, stream));
  }
  return SUCCESS;
}
Status AiCpuBaseTask::UpdateArgTable(const SingleOpModelParam &param) {
  // aicpu does not have workspace, for now
  return DoUpdateArgTable(param, false);
}

uint32_t AiCpuBaseTask::GetTaskType() const { return kTaskTypeAicpu; }

void AiCpuTask::GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) {
  arg_base = reinterpret_cast<uintptr_t *>(io_addr_host_.data());
  arg_count = io_addr_host_.size();
}

void AiCpuCCTask::SetKernelArgs(std::unique_ptr<uint8_t[]> args, size_t arg_size) {
  args_ = std::move(args);
  arg_size_ = arg_size;
  // The block_dim value defaults to 1 for rtCpuKernelLaunch
  block_dim_ = 1;
}

void AiCpuCCTask::SetSoName(const std::string &so_name) { so_name_ = so_name; }
void AiCpuCCTask::SetkernelName(const std::string &kernel_Name) { kernel_name_ = kernel_Name; }
void AiCpuCCTask::SetIoAddr(uintptr_t *io_addr) { io_addr_ = io_addr; }
const void *AiCpuCCTask::GetArgs() const { return args_.get(); }
size_t AiCpuCCTask::GetArgSize() const { return arg_size_; }

AiCpuCCTask::~AiCpuCCTask() {}
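
// Launches an aicpu "CC" kernel by so/kernel name via
// rtCpuKernelLaunchWithFlag; sm_desc stays nullptr because the L2 buffer
// is not supported here.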
Status AiCpuCCTask::LaunchKernel(rtStream_t stream) {
  GELOGI("To invoke rtCpuKernelLaunch. block_dim = %u, so_name is %s, kernel_name is %s", block_dim_, so_name_.data(),
         kernel_name_.data());
  // sm_desc is nullptr, because the l2 buffer is not supported
  auto *sm_desc = reinterpret_cast<rtSmDesc_t *>(sm_desc_);
  auto ret = rtCpuKernelLaunchWithFlag(static_cast<const void *>(so_name_.data()),
                                       static_cast<const void *>(kernel_name_.data()),
                                       block_dim_, args_.get(), static_cast<uint32_t>(arg_size_),
                                       sm_desc, stream, dump_flag_);
  if (ret != RT_ERROR_NONE) {
    GELOGE(ret, "Invoke rtCpuKernelLaunchWithFlag failed. ret = %d", ret);
    return RT_ERROR_TO_GE_STATUS(ret);
  }
  GELOGI("[TASK_INFO] %lu/%s", kernel_id_, op_type_.c_str());
  GELOGD("Invoke rtCpuKernelLaunchWithFlag succeeded");
  auto status = OpenDump(stream);
  if (status != SUCCESS) {
    GELOGE(status, "Open dump failed in the aicpucc single op %s", this->kernel_name_.c_str());
    return status;
  }
  return SUCCESS;
}

Status AiCpuCCTask::LaunchKernel(const std::vector<GeTensorDesc> &input_desc,
                                 const std::vector<DataBuffer> &input_buffers,
                                 std::vector<GeTensorDesc> &output_desc,
                                 std::vector<DataBuffer> &output_buffers,
                                 rtStream_t stream) {
  GE_CHK_STATUS_RET_NOLOG(UpdateExtInfo(input_desc, output_desc, stream));
  GE_CHK_STATUS_RET_NOLOG(UpdateIoAddr(input_buffers, output_buffers));
  GE_CHK_STATUS_RET_NOLOG(LaunchKernel(stream));
  if (unknown_type_ == DEPEND_SHAPE_RANGE) {
    GE_CHK_RT_RET(rtStreamSynchronize(stream));
    GE_CHK_STATUS_RET_NOLOG(UpdateOutputShape(output_desc));
  }
  return SUCCESS;
}

void AiCpuCCTask::GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) {
  arg_base = io_addr_;
  arg_count = io_addr_num_;
}
}  // namespace ge

The Graph Engine (GE) module is a submodule of MindSpore, implemented in C++. It sits between the front-end module (ME) and the underlying hardware, bridging the two. GE takes the graph delivered by ME as input, applies a series of deep graph optimizations, and outputs a graph that runs efficiently on the underlying hardware. GE performs optimizations tailored to the hardware architecture of the Ascend AI processor in order to fully exploit its compute power. During model training and inference, GE is invoked automatically and is transparent to the user. GE mainly consists of two parts, GE API and GE Core; the detailed architecture is shown in the diagram below.