You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they may include dashes ('-') and can be up to 35 characters long.

op_task.cc 29 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714
  1. /**
  2. * Copyright 2019-2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "single_op/task/op_task.h"
  17. #include <google/protobuf/extension_set.h>
  18. #include <chrono>
  19. #include <thread>
  20. #include "aicpu/common/aicpu_task_struct.h"
  21. #include "common/dump/dump_manager.h"
  22. #include "common/dump/dump_op.h"
  23. #include "common/formats/formats.h"
  24. #include "framework/common/debug/log.h"
  25. #include "register/op_tiling.h"
  26. #include "runtime/rt.h"
namespace ge {
namespace {
// Max number of retries when rtKernelLaunch fails in TbeOpTask::LaunchKernel.
constexpr int kLaunchRetryTimes = 1000;
// Milliseconds slept between kernel-launch retries.
constexpr int kSleepTime = 10;
// Flag value placed in the copy-input table to tell the copy kernel to
// release the source buffer after copying.
constexpr uint64_t kReleaseFlag = 1;
// Each output contributes two copy entries: raw data and shape data.
constexpr int kCopyNum = 2;
}
  34. Status OpTask::OpenDump(const std::vector<uintptr_t> &io_addr, rtStream_t stream) {
  35. if (DumpManager::GetInstance().GetDumpProperties().IsSingleOpNeedDump()) {
  36. GELOGI("Dump is open in single op,start to set dump info");
  37. std::vector<uint64_t> input_addrs;
  38. std::vector<uint64_t> output_adds;
  39. auto input_size = op_desc_->GetInputsSize();
  40. auto output_size = op_desc_->GetOutputsSize();
  41. auto all_size = io_addr.size();
  42. if (input_size + output_size != all_size) {
  43. GELOGE(FAILED, "io_addr size is not equal input and output size");
  44. return FAILED;
  45. }
  46. for (size_t i = 0; i < input_size; i++) {
  47. uint64_t input_addr = static_cast<uint64_t>(io_addr[i]);
  48. input_addrs.emplace_back(input_addr);
  49. }
  50. for (size_t j = 0; j < output_size; j++) {
  51. uint64_t output_addr = static_cast<uint64_t>(io_addr[input_size + j]);
  52. output_adds.emplace_back(output_addr);
  53. }
  54. dump_op_.SetDumpInfo(DumpManager::GetInstance().GetDumpProperties(), op_desc_, input_addrs, output_adds, stream);
  55. auto status = dump_op_.LaunchDumpOp();
  56. if (status != SUCCESS) {
  57. GELOGE(status, "Launch dump op failed in single op");
  58. return status;
  59. }
  60. return SUCCESS;
  61. }
  62. GELOGI("Dump is not open in single op");
  63. return SUCCESS;
  64. }
  65. void TbeOpTask::SetStubFunc(const std::string &name, const void *stub_func) {
  66. this->stub_name_ = name;
  67. this->stub_func_ = stub_func;
  68. }
  69. void TbeOpTask::SetKernelArgs(std::unique_ptr<uint8_t[]> &&args, size_t arg_size, uint32_t block_dim,
  70. const OpDescPtr &op_desc) {
  71. args_ = std::move(args);
  72. arg_size_ = arg_size;
  73. block_dim_ = block_dim;
  74. op_desc_ = op_desc;
  75. }
// Stores the L2/SM descriptor pointer used at kernel-launch time.
void TbeOpTask::SetSmDesc(void *sm_desc) { sm_desc_ = sm_desc; }
// Workspace sizes (bytes) produced by tiling; callers use them to allocate workspaces.
const vector<int64_t> &OpTask::GetWorkspaceSizes() const { return workspace_sizes_; }
void OpTask::SetWorkspaceSizes(const vector<int64_t> &workspace_sizes) { workspace_sizes_ = workspace_sizes; }
  79. TbeOpTask::~TbeOpTask() {
  80. if (sm_desc_ != nullptr) {
  81. (void)rtMemFreeManaged(sm_desc_);
  82. }
  83. if (tiling_buffer_ != nullptr) {
  84. (void)rtFree(tiling_buffer_);
  85. }
  86. }
// Host-side args buffer handed to rtKernelLaunch.
const void *TbeOpTask::GetArgs() const { return args_.get(); }
size_t TbeOpTask::GetArgSize() const { return arg_size_; }
// Stub name recorded by SetStubFunc; used in logs to identify the task.
const std::string &TbeOpTask::GetStubName() const { return stub_name_; }
  90. Status TbeOpTask::LaunchKernel(rtStream_t stream) {
  91. GELOGD("To invoke rtKernelLaunch. task = %s, block_dim = %u", this->stub_name_.c_str(), block_dim_);
  92. auto *sm_desc = reinterpret_cast<rtSmDesc_t *>(sm_desc_);
  93. auto ret = rtKernelLaunch(stub_func_, block_dim_, args_.get(), static_cast<uint32_t>(arg_size_), sm_desc, stream);
  94. int retry_times = 0;
  95. while (ret != RT_ERROR_NONE && retry_times < kLaunchRetryTimes) {
  96. retry_times++;
  97. GELOGW("Retry after %d ms, retry_times: %d", kSleepTime, retry_times);
  98. std::this_thread::sleep_for(std::chrono::milliseconds(kSleepTime));
  99. ret = rtKernelLaunch(stub_func_, block_dim_, args_.get(), arg_size_, sm_desc, stream);
  100. }
  101. if (ret != RT_ERROR_NONE) {
  102. GELOGE(RT_FAILED, "Invoke rtKernelLaunch failed. ret = %d, task = %s", ret, this->stub_name_.c_str());
  103. return RT_FAILED;
  104. }
  105. GELOGI("[TASK_INFO] %s", this->stub_name_.c_str());
  106. return SUCCESS;
  107. }
  108. Status TbeOpTask::UpdateRunInfo(const vector<GeTensorDesc> &input_desc, const vector<GeTensorDesc> &output_desc) {
  109. GE_CHK_STATUS_RET_NOLOG(UpdateNodeByShape(input_desc, output_desc));
  110. // invoke OpParaCalculate
  111. GELOGD("Start to invoke OpParaCalculate.");
  112. optiling::OpRunInfo run_info;
  113. run_info.block_dim = 0;
  114. auto ret = optiling::OpParaCalculate(*node_, run_info);
  115. if (ret != GRAPH_SUCCESS) {
  116. GELOGE(FAILED, "Failed to invoke OpParaCalculate. ret = %u", ret);
  117. return FAILED;
  118. }
  119. SetWorkspaceSizes(run_info.workspaces);
  120. block_dim_ = run_info.block_dim;
  121. tiling_data_ = run_info.tiling_data.str();
  122. GELOGD("Done invoking OpParaCalculate successfully. block_dim = %u, tiling size = %zu", block_dim_,
  123. tiling_data_.size());
  124. return SUCCESS;
  125. }
  126. Status TbeOpTask::UpdateTensorDesc(const GeTensorDesc &src_tensor, GeTensorDesc &dst_tensor) {
  127. int64_t storage_format_val = static_cast<Format>(FORMAT_RESERVED);
  128. (void)AttrUtils::GetInt(src_tensor, ge::ATTR_NAME_STORAGE_FORMAT, storage_format_val);
  129. auto storage_format = static_cast<Format>(storage_format_val);
  130. if (storage_format == FORMAT_RESERVED) {
  131. GELOGD("Storage format not set. update shape to [%s], and original shape to [%s]",
  132. src_tensor.GetShape().ToString().c_str(), src_tensor.GetOriginShape().ToString().c_str());
  133. dst_tensor.SetShape(src_tensor.GetShape());
  134. dst_tensor.SetOriginShape(src_tensor.GetOriginShape());
  135. } else {
  136. std::vector<int64_t> storage_shape;
  137. if (!AttrUtils::GetListInt(src_tensor, ge::ATTR_NAME_STORAGE_SHAPE, storage_shape)) {
  138. GELOGE(PARAM_INVALID, "Failed to get storage_shape while storage_format was set");
  139. return PARAM_INVALID;
  140. }
  141. GELOGD("Storage format set. update shape to [%s], and original shape to [%s]",
  142. GeShape(storage_shape).ToString().c_str(), src_tensor.GetShape().ToString().c_str());
  143. dst_tensor.SetShape(GeShape(std::move(storage_shape)));
  144. dst_tensor.SetOriginShape(src_tensor.GetShape());
  145. }
  146. return SUCCESS;
  147. }
  148. Status TbeOpTask::UpdateNodeByShape(const vector<GeTensorDesc> &input_desc, const vector<GeTensorDesc> &output_desc) {
  149. auto op_desc = node_->GetOpDesc();
  150. GE_CHECK_NOTNULL(op_desc);
  151. // Set runtime shape to node
  152. for (size_t i = 0; i < input_desc.size(); ++i) {
  153. auto tensor_desc = op_desc->MutableInputDesc(i);
  154. auto &runtime_tensor_desc = input_desc[i];
  155. GE_CHECK_NOTNULL(tensor_desc);
  156. GE_CHK_STATUS_RET(UpdateTensorDesc(runtime_tensor_desc, *tensor_desc));
  157. }
  158. for (size_t i = 0; i < output_desc.size(); ++i) {
  159. auto tensor_desc = op_desc->MutableOutputDesc(i);
  160. auto &runtime_tensor_desc = output_desc[i];
  161. GE_CHECK_NOTNULL(tensor_desc);
  162. GE_CHK_STATUS_RET(UpdateTensorDesc(runtime_tensor_desc, *tensor_desc));
  163. }
  164. return SUCCESS;
  165. }
// Attaches the node and a pre-allocated device tiling buffer (of at most
// max_tiling_size bytes) so the task can be launched with runtime shapes.
void TbeOpTask::EnableDynamicSupport(const NodePtr &node, void *tiling_buffer, size_t max_tiling_size) {
  node_ = node;
  tiling_buffer_ = tiling_buffer;
  max_tiling_size_ = max_tiling_size;
}
  171. Status TbeOpTask::LaunchKernel(const vector<void *> &inputs, const vector<void *> &outputs,
  172. const vector<void *> &workspaces, rtStream_t stream) {
  173. GELOGD("[%s] Start to launch kernel", node_->GetName().c_str());
  174. std::vector<void *> args;
  175. args.insert(args.end(), inputs.begin(), inputs.end());
  176. args.insert(args.end(), outputs.begin(), outputs.end());
  177. args.insert(args.end(), workspaces.begin(), workspaces.end());
  178. if (tiling_buffer_ != nullptr) {
  179. GELOGD("[%s] Start to copy tiling info. size = %zu", node_->GetName().c_str(), tiling_data_.size());
  180. GE_CHK_RT_RET(rtMemcpyAsync(tiling_buffer_, max_tiling_size_, tiling_data_.data(), tiling_data_.size(),
  181. RT_MEMCPY_HOST_TO_DEVICE_EX, stream));
  182. args.emplace_back(tiling_buffer_);
  183. }
  184. if (memcpy_s(args_.get(), arg_size_, args.data(), args.size() * sizeof(void *)) != EOK) {
  185. GELOGE(INTERNAL_ERROR, "[%s] Failed to update kernel args.", node_->GetName().c_str());
  186. return INTERNAL_ERROR;
  187. }
  188. GELOGD("[%s] Start to invoke rtKernelLaunch", node_->GetName().c_str());
  189. GE_CHK_RT_RET(rtKernelLaunch(stub_func_, block_dim_, args_.get(), arg_size_, nullptr, stream));
  190. GELOGD("[%s] Done invoking rtKernelLaunch successfully", node_->GetName().c_str());
  191. return SUCCESS;
  192. }
// Frees the device-side ext-info buffer if it was ever allocated
// (by SetExtInfoAndType); free errors are ignored in a destructor.
AiCpuBaseTask::~AiCpuBaseTask() {
  if (ext_info_addr_dev_ != nullptr) {
    (void)rtFree(ext_info_addr_dev_);
  }
}
  198. Status AiCpuBaseTask::SetExtInfoAndType(const std::string &kernel_ext_info) {
  199. if (kernel_ext_info.empty()) {
  200. GELOGI("Kernel_ext_info is empty, no need copy to device.");
  201. return SUCCESS;
  202. }
  203. int32_t unknown_shape_type_val = 0;
  204. (void) AttrUtils::GetInt(op_desc_, ::ge::ATTR_NAME_UNKNOWN_SHAPE_TYPE, unknown_shape_type_val);
  205. GELOGD("Get unknown_type is %d.", unknown_shape_type_val);
  206. unknown_type_ = static_cast<UnknowShapeOpType>(unknown_shape_type_val);
  207. aicpu_ext_handle_.reset(new(std::nothrow) ::ge::hybrid::AicpuExtInfoHandler(op_desc_->GetName(),
  208. num_inputs_,
  209. num_outputs_,
  210. unknown_type_));
  211. GE_CHK_BOOL_RET_STATUS(aicpu_ext_handle_ != nullptr, FAILED, "Malloc aicpu_ext_handle mem failed!");
  212. Status ret = aicpu_ext_handle_->Parse(kernel_ext_info);
  213. if (ret != SUCCESS) {
  214. GELOGE(ret, "Parse kernel ext info failed, kernel_ext_info_size=%zu.", kernel_ext_info.size());
  215. return ret;
  216. }
  217. GE_CHK_RT_RET(rtMalloc(&ext_info_addr_dev_, kernel_ext_info.size(), RT_MEMORY_HBM));
  218. GE_CHK_RT_RET(rtMemcpy(ext_info_addr_dev_, kernel_ext_info.size(),
  219. kernel_ext_info.data(), kernel_ext_info.size(), RT_MEMCPY_HOST_TO_DEVICE));
  220. return SUCCESS;
  221. }
  222. Status AiCpuBaseTask::UpdateExtInfo(const std::vector<GeTensorDesc> &input_desc,
  223. std::vector<GeTensorDesc> &output_desc) {
  224. GELOGI("Update ext info begin, unknown_type=%d.", unknown_type_);
  225. if (num_inputs_ == 0 && num_outputs_ == 0) {
  226. GELOGI("No input and output, no need update ext info.");
  227. return SUCCESS;
  228. }
  229. GE_CHECK_NOTNULL(aicpu_ext_handle_);
  230. for (size_t i = 0; i < num_inputs_; ++i) {
  231. GE_CHK_STATUS_RET(aicpu_ext_handle_->UpdateInputShapeAndType(i, input_desc[i]),
  232. "Input[%zu] update input shape failed.", i);
  233. }
  234. if (unknown_type_ != DEPEND_COMPUTE) {
  235. for (size_t j = 0; j < num_outputs_; ++j) {
  236. GE_CHK_STATUS_RET(aicpu_ext_handle_->UpdateOutputShapeAndType(j, output_desc[j]),
  237. "Output[%zu] UpdateOutputShapeAndType failed.", j);
  238. // debug code
  239. GELOGD("No input and output, no need update ext info.");
  240. }
  241. }
  242. GE_CHK_RT_RET(rtMemcpy(ext_info_addr_dev_,
  243. aicpu_ext_handle_->GetExtInfoLen(), // check size
  244. aicpu_ext_handle_->GetExtInfo(),
  245. aicpu_ext_handle_->GetExtInfoLen(),
  246. RT_MEMCPY_HOST_TO_DEVICE));
  247. GELOGI("Update ext info end.");
  248. return SUCCESS;
  249. }
  250. Status AiCpuBaseTask::UpdateOutputShape(vector<GeTensorDesc> &output_desc) {
  251. if (num_outputs_ == 0) {
  252. GELOGD("AiCpuBaseTask output_num is 0, no need update output shape.");
  253. return SUCCESS;
  254. }
  255. GELOGD("Start to update DEPEND_SHAPE_RANGE AiCpuBaseTask outputshape.");
  256. GE_CHK_RT_RET(rtMemcpy(aicpu_ext_handle_->GetExtInfo(),
  257. aicpu_ext_handle_->GetExtInfoLen(),
  258. ext_info_addr_dev_,
  259. aicpu_ext_handle_->GetExtInfoLen(),
  260. RT_MEMCPY_DEVICE_TO_HOST));
  261. for (size_t i = 0; i < num_outputs_; ++i) {
  262. GeShape shape;
  263. DataType data_type;
  264. aicpu_ext_handle_->GetOutputShapeAndType(i, shape, data_type);
  265. GE_CHK_STATUS_RET(UpdateShapeToOutputDesc(shape, output_desc[i]),
  266. "AiCpuCCTask Update [%zu]th output shape failed.", i);
  267. }
  268. GELOGD("Update DEPEND_SHAPE_RANGE AiCpuBaseTask outputshape finished.");
  269. return SUCCESS;
  270. }
  271. Status AiCpuBaseTask::UpdateShapeToOutputDesc(const GeShape &shape_new, GeTensorDesc &output_desc) {
  272. auto shape_old = output_desc.GetShape();
  273. output_desc.SetShape(shape_new);
  274. GELOGD("Update AiCpuBaseTask shape from %s to %s", shape_old.ToString().c_str(), shape_new.ToString().c_str());
  275. auto origin_shape_old = output_desc.GetOriginShape();
  276. auto origin_format = output_desc.GetOriginFormat();
  277. auto format = output_desc.GetFormat();
  278. if (origin_format == format) {
  279. output_desc.SetOriginShape(shape_new);
  280. return SUCCESS;
  281. }
  282. std::vector<int64_t> origin_dims_new;
  283. auto trans_ret = formats::TransShape(format, shape_new.GetDims(),
  284. output_desc.GetDataType(), origin_format, origin_dims_new);
  285. GE_CHK_STATUS_RET(trans_ret,
  286. "AiCpuTask originFormat[%d] is not same as format[%d], but TransShape failed, shape=%s.",
  287. origin_format, format, shape_new.ToString().c_str());
  288. auto origin_shape_new = GeShape(origin_dims_new);
  289. output_desc.SetOriginShape(origin_shape_new);
  290. GELOGD("AiCpuTask originFormat[%d] is not same as format[%d], need update from %s ro %s.",
  291. origin_format, format, origin_shape_old.ToString().c_str(), origin_shape_new.ToString().c_str());
  292. return SUCCESS;
  293. }
  294. AiCpuTask::~AiCpuTask() {
  295. if (args_ != nullptr) {
  296. (void)rtFree(args_);
  297. }
  298. if (io_addr_ != nullptr) {
  299. (void)rtFree(io_addr_);
  300. }
  301. if (dynamic_flag_ && workspace_addr_ != nullptr) {
  302. (void)rtFree(workspace_addr_);
  303. }
  304. if (copy_workspace_buf_ != nullptr) {
  305. (void)rtFree(copy_workspace_buf_);
  306. }
  307. if (copy_ioaddr_dev_ != nullptr) {
  308. (void)rtFree(copy_ioaddr_dev_);
  309. }
  310. if (copy_input_release_flag_dev_ != nullptr) {
  311. (void)rtFree(copy_input_release_flag_dev_);
  312. }
  313. if (copy_input_data_size_dev_ != nullptr) {
  314. (void)rtFree(copy_input_data_size_dev_);
  315. }
  316. if (copy_input_src_dev_ != nullptr) {
  317. (void)rtFree(copy_input_src_dev_);
  318. }
  319. if (copy_input_dst_dev_ != nullptr) {
  320. (void)rtFree(copy_input_dst_dev_);
  321. }
  322. if (copy_task_args_buf_ != nullptr) {
  323. (void)rtFree(copy_task_args_buf_);
  324. }
  325. for (auto summary : output_summary_) {
  326. if (summary != nullptr) {
  327. (void)rtFree(summary);
  328. }
  329. }
  330. }
  331. const void *AiCpuTask::GetIOAddr() const { return io_addr_; }
  332. Status AiCpuTask::LaunchKernel(rtStream_t stream) {
  333. GELOGD("Start to launch kernel. task = %s", this->op_type_.c_str());
  334. auto ret = rtMemcpyAsync(workspace_addr_, task_info_.size(), task_info_.data(), task_info_.size(),
  335. RT_MEMCPY_HOST_TO_DEVICE_EX, stream);
  336. if (ret != RT_ERROR_NONE) {
  337. GELOGE(RT_FAILED, "rtMemcpyAsync workspace data failed. ret = %d, task = %s", ret, this->op_type_.c_str());
  338. return RT_FAILED;
  339. }
  340. GELOGI("To invoke rtKernelLaunchEx. task = %s", this->op_type_.c_str());
  341. ret = rtKernelLaunchEx(args_, arg_size_, 0, stream);
  342. if (ret != RT_ERROR_NONE) {
  343. GELOGE(RT_FAILED, "Invoke rtKernelLaunch failed. ret = %d, task = %s", ret, this->op_type_.c_str());
  344. return RT_FAILED;
  345. }
  346. GELOGI("[TASK_INFO] is %s", this->task_info_.c_str());
  347. GELOGD("Done launch kernel successfully. task = %s", this->op_type_.c_str());
  348. return SUCCESS;
  349. }
  350. Status AiCpuTask::PrepareCopyInputs(vector<void *> &outputs,
  351. const std::vector<void *> &out_shape_hbm) {
  352. std::vector<uint64_t> copy_input_release_flag;
  353. std::vector<uint64_t> copy_input_data_size;
  354. std::vector<uint64_t> copy_input_src;
  355. std::vector<uint64_t> copy_input_dst;
  356. for (size_t i = 0; i < num_outputs_; ++i) {
  357. const auto &summary = output_summary_host_[i];
  358. GELOGI("Node out[%zu] summary, shape data=0x%lx, shape data size=%lu, raw data=0x%lx, raw data size=%lu.",
  359. i, summary.shape_data_ptr, summary.shape_data_size,
  360. summary.raw_data_ptr, summary.raw_data_size);
  361. auto output = outputs[i];
  362. copy_input_release_flag.emplace_back(kReleaseFlag);
  363. copy_input_data_size.emplace_back(summary.raw_data_size);
  364. copy_input_src.emplace_back(summary.raw_data_ptr);
  365. copy_input_dst.emplace_back(reinterpret_cast<uintptr_t>(output));
  366. const auto &shape_buffer = out_shape_hbm[i];
  367. copy_input_release_flag.emplace_back(kReleaseFlag);
  368. copy_input_data_size.emplace_back(summary.shape_data_size);
  369. copy_input_src.emplace_back(summary.shape_data_ptr);
  370. copy_input_dst.emplace_back(reinterpret_cast<uintptr_t>(shape_buffer));
  371. }
  372. const size_t copy_input_buf_len = num_outputs_ * kCopyNum * sizeof(uint64_t);
  373. GE_CHK_RT_RET(rtMemcpy(copy_input_release_flag_dev_, copy_input_buf_len,
  374. copy_input_release_flag.data(), copy_input_buf_len, RT_MEMCPY_HOST_TO_DEVICE));
  375. GE_CHK_RT_RET(rtMemcpy(copy_input_data_size_dev_, copy_input_buf_len,
  376. copy_input_data_size.data(), copy_input_buf_len, RT_MEMCPY_HOST_TO_DEVICE));
  377. GE_CHK_RT_RET(rtMemcpy(copy_input_src_dev_, copy_input_buf_len,
  378. copy_input_src.data(), copy_input_buf_len, RT_MEMCPY_HOST_TO_DEVICE));
  379. GE_CHK_RT_RET(rtMemcpy(copy_input_dst_dev_, copy_input_buf_len,
  380. copy_input_dst.data(), copy_input_buf_len, RT_MEMCPY_HOST_TO_DEVICE));
  381. return SUCCESS;
  382. }
// Copies each output's ResultSummary from device to host, then allocates a
// device (HBM) buffer big enough to hold that output's shape data. The
// buffers are handed back through out_shape_hbm.
Status AiCpuTask::ReadResultSummaryAndPrepareMemory(std::vector<void *> &out_shape_hbm) {
  for (size_t i = 0; i < num_outputs_; ++i) {
    auto &result_summary = output_summary_host_[i];
    GE_CHK_RT_RET(rtMemcpy(&result_summary, sizeof(aicpu::FWKAdapter::ResultSummary),
                           output_summary_[i], sizeof(aicpu::FWKAdapter::ResultSummary),
                           RT_MEMCPY_DEVICE_TO_HOST));
    auto shape_data_size = result_summary.shape_data_size;
    void *shape_buffer = nullptr;
    // NOTE(review): the guard is registered before rtMalloc, yet the buffer
    // is also stored in out_shape_hbm and read later by the caller — confirm
    // the guard's release point does not free it while still referenced.
    GE_MAKE_GUARD_RTMEM(shape_buffer);
    GE_CHK_RT_RET(rtMalloc(&shape_buffer, shape_data_size, RT_MEMORY_HBM));
    out_shape_hbm.emplace_back(shape_buffer);
  }
  return SUCCESS;
}
// Launches the device-side copy kernel that moves each output's raw data and
// shape data to the destinations prepared by PrepareCopyInputs, then blocks
// until the stream drains so the host can safely read the results.
Status AiCpuTask::CopyDataToHbm(vector<void *> &outputs,
                                const std::vector<void *> &out_shape_hbm,
                                rtStream_t stream) {
  GE_CHK_STATUS_RET_NOLOG(PrepareCopyInputs(outputs, out_shape_hbm));
  // copy_task_args_buf_ holds the STR_FWK_OP_KERNEL written by SetMemCopyTask.
  GE_CHK_RT_RET(rtKernelLaunchEx(copy_task_args_buf_, sizeof(STR_FWK_OP_KERNEL),
                                 RT_KERNEL_DEFAULT, stream));
  GE_CHK_RT_RET(rtStreamSynchronize(stream));
  return SUCCESS;
}
  406. Status AiCpuTask::UpdateShapeByHbmBuffer(vector<GeTensorDesc> &output_desc,
  407. const std::vector<void *> &out_shape_hbm) {
  408. for (size_t i = 0; i < num_outputs_; ++i) {
  409. const auto &result_summary = output_summary_host_[i];
  410. std::vector<int64_t> shape_dims;
  411. const auto &shape_hbm = out_shape_hbm[i];
  412. uint32_t dim_num = result_summary.shape_data_size / sizeof(int64_t);
  413. std::unique_ptr<int64_t[]> shape_addr(new(std::nothrow) int64_t[dim_num]());
  414. GE_CHECK_NOTNULL(shape_addr);
  415. GE_CHK_RT_RET(rtMemcpy(shape_addr.get(), result_summary.shape_data_size,
  416. shape_hbm, result_summary.shape_data_size, RT_MEMCPY_DEVICE_TO_HOST));
  417. for (uint32_t dim_idx = 0; dim_idx < dim_num; ++dim_idx) {
  418. shape_dims.emplace_back(shape_addr[dim_idx]);
  419. GELOGD("Node [%zu]th output dim[%u]=%ld.", i, dim_idx, shape_addr[dim_idx]);
  420. }
  421. GE_CHK_STATUS_RET(UpdateShapeToOutputDesc(GeShape(shape_dims), output_desc[i]),
  422. "AiCpuTask update [%zu]th output shape failed.", i);
  423. }
  424. return SUCCESS;
  425. }
  426. Status AiCpuTask::UpdateShapeAndDataByResultSummary(vector<GeTensorDesc> &output_desc,
  427. vector<void *> &outputs, rtStream_t stream) {
  428. if (num_outputs_ == 0) {
  429. GELOGI("Output num is 0, there is no need to update the output and size.");
  430. return SUCCESS;
  431. }
  432. GELOGI("Update shape and data by result summary begin.");
  433. std::vector<void *> out_shape_hbm;
  434. GE_CHK_STATUS_RET(ReadResultSummaryAndPrepareMemory(out_shape_hbm),
  435. "Read ResultSummary and update output shape failed.");
  436. GE_CHK_STATUS_RET(CopyDataToHbm(outputs, out_shape_hbm, stream),
  437. "Copy data to output failed.");
  438. GE_CHK_STATUS_RET(UpdateShapeByHbmBuffer(output_desc, out_shape_hbm),
  439. "Update shape by hbm buffer failed.");
  440. GELOGI("Update shape and data by result summary end.");
  441. return SUCCESS;
  442. }
  443. Status AiCpuTask::SetIO(const vector<void *> &inputs, vector<void *> &outputs) {
  444. vector<uint64_t> io_addrs;
  445. io_addrs.reserve(num_inputs_ + num_outputs_);
  446. for (size_t i = 0; i < num_inputs_; ++i) {
  447. GE_CHECK_NOTNULL(inputs[i]);
  448. GELOGD("AiCpuTask input[%zu] addr = %p", i, inputs[i]);
  449. io_addrs.emplace_back(reinterpret_cast<uintptr_t>(inputs[i]));
  450. }
  451. if (unknown_type_ != DEPEND_COMPUTE) {
  452. for (size_t i = 0; i < num_outputs_; ++i) {
  453. GE_CHECK_NOTNULL(outputs[i]);
  454. GELOGD("AiCpuTask output[%zu] addr = %p", i, outputs[i]);
  455. io_addrs.emplace_back(reinterpret_cast<uintptr_t>(outputs[i]));
  456. }
  457. } else {
  458. for (size_t i = 0; i < num_outputs_; ++i) {
  459. void *summary_addr = output_summary_[i];
  460. io_addrs.emplace_back(reinterpret_cast<uintptr_t>(summary_addr));
  461. }
  462. }
  463. if (!io_addrs.empty()) {
  464. auto *dst_io_addr = const_cast<uintptr_t *>(reinterpret_cast<const uintptr_t *>(io_addr_));
  465. GE_CHK_RT_RET(rtMemcpy(dst_io_addr,
  466. sizeof(uint64_t) * io_addrs.size(),
  467. &io_addrs[0],
  468. sizeof(uint64_t) * io_addrs.size(),
  469. RT_MEMCPY_HOST_TO_DEVICE));
  470. GE_CHECK_NOTNULL(dst_io_addr);
  471. };
  472. return SUCCESS;
  473. }
  474. Status AiCpuTask::InitForSummaryAndCopy() {
  475. if (unknown_type_ != DEPEND_COMPUTE || num_outputs_ == 0) {
  476. GELOGI("Unknown_type is %d, output num is %d.", unknown_type_, num_outputs_);
  477. return SUCCESS;
  478. }
  479. output_summary_.resize(num_outputs_);
  480. constexpr auto result_summary_size = sizeof(aicpu::FWKAdapter::ResultSummary);
  481. for (size_t i = 0; i < num_outputs_; ++i) {
  482. GE_CHK_RT_RET(rtMalloc(&output_summary_[i], result_summary_size, RT_MEMORY_HBM));
  483. }
  484. output_summary_host_.resize(num_outputs_);
  485. const size_t copy_input_buf_len = num_outputs_ * kCopyNum * sizeof(uint64_t);
  486. GE_CHK_RT_RET(rtMalloc(&copy_input_release_flag_dev_, copy_input_buf_len, RT_MEMORY_HBM));
  487. GE_CHK_RT_RET(rtMalloc(&copy_input_data_size_dev_, copy_input_buf_len, RT_MEMORY_HBM));
  488. GE_CHK_RT_RET(rtMalloc(&copy_input_src_dev_, copy_input_buf_len, RT_MEMORY_HBM));
  489. GE_CHK_RT_RET(rtMalloc(&copy_input_dst_dev_, copy_input_buf_len, RT_MEMORY_HBM));
  490. GE_CHK_RT_RET(rtMalloc(&copy_task_args_buf_, sizeof(STR_FWK_OP_KERNEL), RT_MEMORY_HBM));
  491. std::vector<uint64_t> copy_io_addr;
  492. copy_io_addr.emplace_back(reinterpret_cast<uintptr_t>(copy_input_release_flag_dev_));
  493. copy_io_addr.emplace_back(reinterpret_cast<uintptr_t>(copy_input_data_size_dev_));
  494. copy_io_addr.emplace_back(reinterpret_cast<uintptr_t>(copy_input_src_dev_));
  495. copy_io_addr.emplace_back(reinterpret_cast<uintptr_t>(copy_input_dst_dev_));
  496. const auto copy_io_addr_size = sizeof(uint64_t) * copy_io_addr.size();
  497. GE_CHK_RT_RET(rtMalloc(&copy_ioaddr_dev_, copy_io_addr_size, RT_MEMORY_HBM));
  498. GE_CHK_RT_RET(rtMemcpy(copy_ioaddr_dev_, copy_io_addr_size,
  499. copy_io_addr.data(), copy_io_addr_size, RT_MEMCPY_HOST_TO_DEVICE));
  500. return SUCCESS;
  501. }
  502. Status AiCpuTask::SetMemCopyTask(const domi::KernelExDef &kernel_def) {
  503. if (kernel_def.args_size() > sizeof(STR_FWK_OP_KERNEL)) {
  504. GELOGE(PARAM_INVALID, "sizeof STR_FWK_OP_KERNEL is: %lu, but args_size is: %d",
  505. sizeof(STR_FWK_OP_KERNEL), kernel_def.args_size());
  506. return PARAM_INVALID;
  507. }
  508. GE_CHK_RT_RET(rtMalloc(&copy_workspace_buf_, kernel_def.task_info_size(), RT_MEMORY_HBM));
  509. GE_CHK_RT_RET(rtMemcpy(copy_workspace_buf_, kernel_def.task_info_size(),
  510. kernel_def.task_info().data(), kernel_def.task_info_size(), RT_MEMCPY_HOST_TO_DEVICE));
  511. STR_FWK_OP_KERNEL aicpu_task = {0};
  512. auto sec_ret = memcpy_s(&aicpu_task, sizeof(STR_FWK_OP_KERNEL),
  513. kernel_def.args().data(), kernel_def.args().size());
  514. if (sec_ret != EOK) {
  515. GELOGE(FAILED, "memcpy failed, ret: %d", sec_ret);
  516. return FAILED;
  517. }
  518. aicpu_task.fwkKernelBase.fwk_kernel.inputOutputAddr = reinterpret_cast<uintptr_t>(copy_ioaddr_dev_);
  519. aicpu_task.fwkKernelBase.fwk_kernel.workspaceBaseAddr = reinterpret_cast<uintptr_t>(copy_workspace_buf_);
  520. aicpu_task.fwkKernelBase.fwk_kernel.extInfoAddr = 0;
  521. aicpu_task.fwkKernelBase.fwk_kernel.extInfoLen = 0;
  522. GE_CHK_RT_RET(rtMemcpy(copy_task_args_buf_, sizeof(STR_FWK_OP_KERNEL),
  523. &aicpu_task, sizeof(STR_FWK_OP_KERNEL), RT_MEMCPY_HOST_TO_DEVICE));
  524. return SUCCESS;
  525. }
  526. Status AiCpuTask::LaunchKernel(const std::vector<GeTensorDesc> &input_desc,
  527. const std::vector<void *> &inputs,
  528. std::vector<GeTensorDesc> &output_desc,
  529. std::vector<void *> &outputs,
  530. rtStream_t stream) {
  531. GE_CHK_STATUS_RET_NOLOG(UpdateExtInfo(input_desc, output_desc));
  532. GE_CHK_STATUS_RET_NOLOG(SetIO(inputs, outputs));
  533. GE_CHK_STATUS_RET_NOLOG(LaunchKernel(stream));
  534. GE_CHK_RT_RET(rtStreamSynchronize(stream));
  535. if (unknown_type_ == DEPEND_SHAPE_RANGE) {
  536. GE_CHK_STATUS_RET_NOLOG(UpdateOutputShape(output_desc));
  537. } else if (unknown_type_ == DEPEND_COMPUTE) {
  538. GE_CHK_STATUS_RET_NOLOG(UpdateShapeAndDataByResultSummary(output_desc, outputs, stream));
  539. }
  540. return SUCCESS;
  541. }
  542. void AiCpuCCTask::SetKernelArgs(std::unique_ptr<uint8_t[]> args, size_t arg_size) {
  543. args_ = std::move(args);
  544. arg_size_ = arg_size;
  545. // The blockdim value is defult "1" for rtCpuKernelLaunch
  546. block_dim_ = 1;
  547. }
// Name of the shared object containing the AICPU kernel implementation.
void AiCpuCCTask::SetSoName(const std::string &so_name) { so_name_ = so_name; }
// Kernel entry symbol inside the shared object.
void AiCpuCCTask::SetkernelName(const std::string &kernel_Name) { kernel_name_ = kernel_Name; }
// Raw IO table pointer; not freed by ~AiCpuCCTask (ownership presumably lies
// with the caller — verify).
void AiCpuCCTask::SetIoAddr(void *io_addr) { io_addr_ = io_addr; }
const void *AiCpuCCTask::GetIOAddr() const { return io_addr_; }
const void *AiCpuCCTask::GetArgs() const { return args_.get(); }
size_t AiCpuCCTask::GetArgSize() const { return arg_size_; }
// args_ is a unique_ptr and releases itself; io_addr_ is deliberately not
// freed here (ownership presumably lies with whoever called SetIoAddr —
// verify against the caller).
AiCpuCCTask::~AiCpuCCTask() {
}
  556. Status AiCpuCCTask::LaunchKernel(rtStream_t stream) {
  557. GELOGI("To invoke rtCpuKernelLaunch. block_dim = %u, so_name is %s, kernel_name is %s", block_dim_, so_name_.data(),
  558. kernel_name_.data());
  559. // sm_desc is nullptr, because l2 buffer does not support
  560. auto *sm_desc = reinterpret_cast<rtSmDesc_t *>(sm_desc_);
  561. auto ret =
  562. rtCpuKernelLaunch(static_cast<const void *>(so_name_.data()), static_cast<const void *>(kernel_name_.data()),
  563. block_dim_, args_.get(), static_cast<uint32_t>(arg_size_), sm_desc, stream);
  564. if (ret != RT_ERROR_NONE) {
  565. GELOGE(RT_FAILED, "Invoke rtCpuKernelLaunch failed. ret = %d", ret);
  566. return RT_FAILED;
  567. }
  568. GELOGD("Invoke rtCpuKernelLaunch succeeded");
  569. return SUCCESS;
  570. }
  571. Status AiCpuCCTask::LaunchKernel(const std::vector<GeTensorDesc> &input_desc,
  572. const std::vector<void *> &inputs,
  573. std::vector<GeTensorDesc> &output_desc,
  574. std::vector<void *> &outputs,
  575. rtStream_t stream) {
  576. GE_CHK_BOOL_RET_STATUS(unknown_type_ != DEPEND_COMPUTE, FAILED,
  577. "AiCpuCCTask unknown type[%d] is depend compute, it's not supported now.",
  578. unknown_type_);
  579. GE_CHK_STATUS_RET_NOLOG(UpdateExtInfo(input_desc, output_desc));
  580. size_t arg_index = 0;
  581. auto *task_io_addr = reinterpret_cast<uintptr_t *>(io_addr_);
  582. GE_CHECK_NOTNULL(task_io_addr);
  583. for (auto &input : inputs) {
  584. task_io_addr[arg_index++] = reinterpret_cast<uintptr_t>(input);
  585. }
  586. for (auto &output : outputs) {
  587. task_io_addr[arg_index++] = reinterpret_cast<uintptr_t>(output);
  588. }
  589. GE_CHK_STATUS_RET_NOLOG(LaunchKernel(stream));
  590. GE_CHK_RT_RET(rtStreamSynchronize(stream));
  591. if (unknown_type_ == DEPEND_SHAPE_RANGE) {
  592. GE_CHK_STATUS_RET_NOLOG(UpdateOutputShape(output_desc));
  593. }
  594. return SUCCESS;
  595. }
  596. } // namespace ge

图引擎模块(GE)是MindSpore的一个子模块,其代码由C++实现,位于前端模块ME和底层硬件之间,起到承接作用。图引擎模块以ME下发的图作为输入,然后进行一系列的深度图优化操作,最后输出一张可以在底层硬件上高效运行的图。GE针对昇腾AI处理器的硬件结构特点,做了特定的优化工作,以此来充分发挥出昇腾AI处理器的强大算力。在进行模型训练/推理时,GE会被自动调用,而用户并不感知。GE主要由GE API和GE Core两部分组成,详细的架构图如下所示。