
op_task.cc

/**
 * Copyright 2019-2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "single_op/task/op_task.h"

#include <google/protobuf/extension_set.h>

#include <chrono>
#include <thread>

#include "aicpu/common/aicpu_task_struct.h"
#include "common/dump/dump_manager.h"
#include "common/dump/dump_op.h"
#include "common/profiling/profiling_manager.h"
#include "common/formats/formats.h"
#include "common/math/math_util.h"
#include "framework/common/debug/log.h"
#include "runtime/rt.h"
#include "single_op/task/build_task_utils.h"

namespace ge {
namespace {
constexpr int kLaunchRetryTimes = 1000;
constexpr size_t kMemcpyArgCount = 2;
constexpr int kSleepTime = 10;
constexpr uint64_t kReleaseFlag = 1;
constexpr int kCopyNum = 2;
constexpr uint64_t kInferSessionId = 0;

void FreeHbm(void *var) {
  if (var) {
    (void)rtFree(var);
  }
}
}  // namespace

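// Note: OpenDump assumes the I/O arg table returned by GetIoAddr starts with
// all input addresses followed by all output addresses; those are forwarded
// to dump_op_ when single-op dump is enabled for the inference session.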
Status OpTask::OpenDump(rtStream_t stream) {
  if (DumpManager::GetInstance().GetDumpProperties(kInferSessionId).IsSingleOpNeedDump()) {
    GELOGI("Dump is open in single op, start to set dump info");
    std::vector<uint64_t> input_addrs;
    std::vector<uint64_t> output_addrs;
    auto input_size = op_desc_->GetInputsSize();
    auto output_size = op_desc_->GetOutputsSize();
    uintptr_t *arg_base = nullptr;
    size_t arg_num = 0;
    GetIoAddr(arg_base, arg_num);
    if (arg_num < input_size + output_size) {
      GELOGE(ACL_ERROR_GE_INTERNAL_ERROR,
             "[Check][Size]io_addrs_for_dump_ size %zu is not equal to input and output size %zu",
             arg_num, input_size + output_size);
      REPORT_INNER_ERROR("E19999", "io_addrs_for_dump_ size %zu is not equal to input and output size %zu",
                         arg_num, input_size + output_size);
      return ACL_ERROR_GE_INTERNAL_ERROR;
    }
    for (size_t i = 0; i < input_size; i++) {
      uint64_t input_addr = arg_base[i];
      input_addrs.emplace_back(input_addr);
    }
    for (size_t j = 0; j < output_size; j++) {
      uint64_t output_addr = arg_base[input_size + j];
      output_addrs.emplace_back(output_addr);
    }
    dump_op_.SetDumpInfo(DumpManager::GetInstance().GetDumpProperties(kInferSessionId),
                         op_desc_, input_addrs, output_addrs, stream);
    auto status = dump_op_.LaunchDumpOp();
    if (status != SUCCESS) {
      GELOGE(status, "[Launch][DumpOp] failed in single op.");
      return status;
    }
    return SUCCESS;
  }
  GELOGI("Dump is not open in single op");
  return SUCCESS;
}

void TbeOpTask::SetStubFunc(const std::string &name, const void *stub_func) {
  this->stub_name_ = name;
  this->stub_func_ = stub_func;
  this->task_name_ = name;
}

void TbeOpTask::SetKernelArgs(std::unique_ptr<uint8_t[]> &&args, size_t arg_size, uint32_t block_dim,
                              const OpDescPtr &op_desc) {
  args_ = std::move(args);
  arg_size_ = arg_size;
  block_dim_ = block_dim;
  op_desc_ = op_desc;
}

void TbeOpTask::SetKernelWithHandleArgs(std::unique_ptr<uint8_t[]> &&args, size_t arg_size, uint32_t block_dim,
                                        const OpDescPtr &op_desc,
                                        const domi::KernelDefWithHandle &kernel_def_with_handle) {
  SetKernelArgs(std::move(args), arg_size, block_dim, op_desc);
  original_kernel_key_ = kernel_def_with_handle.original_kernel_key();
  node_info_ = kernel_def_with_handle.node_info();
}

void TbeOpTask::SetSmDesc(void *sm_desc) { sm_desc_ = sm_desc; }

void OpTask::SetModelArgs(std::string model_name, uint32_t model_id) {
  model_name_ = model_name;
  model_id_ = model_id;
}

Status OpTask::GetProfilingArgs(TaskDescInfo &task_desc_info, uint32_t &model_id) {
  uint32_t task_id = 0;
  uint32_t stream_id = 0;
  auto rt_ret = rtGetTaskIdAndStreamID(&task_id, &stream_id);
  if (rt_ret != RT_ERROR_NONE) {
    GELOGE(RT_FAILED, "[Get][TaskIdAndStreamID] failed, ret: 0x%X.", rt_ret);
    REPORT_CALL_ERROR("E19999", "rtGetTaskIdAndStreamID failed, ret: 0x%X.", rt_ret);
    return RT_ERROR_TO_GE_STATUS(rt_ret);
  }
  GE_CHECK_NOTNULL(op_desc_);
  string op_name = op_desc_->GetName();
  GELOGD("Get profiling args of op [%s] end, task_id[%u], stream_id[%u].", op_name.c_str(), task_id, stream_id);
  model_id = model_id_;
  task_desc_info.model_name = model_name_;
  task_desc_info.block_dim = block_dim_;
  task_desc_info.task_id = task_id;
  task_desc_info.stream_id = stream_id;
  task_desc_info.op_name = op_name;
  task_desc_info.op_type = op_desc_->GetType();
  auto &prof_mgr = ProfilingManager::Instance();
  prof_mgr.GetOpInputOutputInfo(op_desc_, task_desc_info);
  return SUCCESS;
}

Status OpTask::UpdateRunInfo() {
  return UNSUPPORTED;
}

Status OpTask::DoUpdateArgTable(const SingleOpModelParam &param, bool keep_workspace) {
  auto addresses = BuildTaskUtils::GetAddresses(op_desc_, param, keep_workspace);
  auto all_addresses = BuildTaskUtils::JoinAddresses(addresses);
  uintptr_t *arg_base = nullptr;
  size_t arg_num = 0;
  GetIoAddr(arg_base, arg_num);
  if (arg_num < all_addresses.size()) {
    GELOGE(ACL_ERROR_GE_INTERNAL_ERROR,
           "[Check][Size][%s] arg number mismatches, expect at least = %zu, but got = %zu.",
           op_desc_->GetName().c_str(), all_addresses.size(), arg_num);
    REPORT_INNER_ERROR("E19999", "%s arg number mismatches, expect at least = %zu, but got = %zu.",
                       op_desc_->GetName().c_str(), all_addresses.size(), arg_num);
    return ACL_ERROR_GE_INTERNAL_ERROR;
  }
  for (void *addr : all_addresses) {
    *arg_base++ = reinterpret_cast<uintptr_t>(addr);
  }
  return SUCCESS;
}

Status OpTask::UpdateArgTable(const SingleOpModelParam &param) {
  return DoUpdateArgTable(param, true);
}

Status OpTask::LaunchKernel(const vector<GeTensorDesc> &input_desc,
                            const vector<DataBuffer> &input_buffers,
                            vector<GeTensorDesc> &output_desc,
                            vector<DataBuffer> &output_buffers,
                            rtStream_t stream) {
  return UNSUPPORTED;
}

const std::string &OpTask::GetTaskType() const { return kTaskTypeInvalid; }

TbeOpTask::~TbeOpTask() {
  if (sm_desc_ != nullptr) {
    (void)rtMemFreeManaged(sm_desc_);
  }
  if (tiling_buffer_ != nullptr) {
    (void)rtFree(tiling_buffer_);
  }
}

const void *TbeOpTask::GetArgs() const { return args_.get(); }

size_t TbeOpTask::GetArgSize() const { return arg_size_; }

const std::string &TbeOpTask::GetStubName() const { return stub_name_; }

const std::string &TbeOpTask::GetTaskType() const { return kTaskTypeAicore; }

void TbeOpTask::SetHandle(void *handle) {
  this->handle_ = handle;
}

Status TbeOpTask::LaunchKernel(rtStream_t stream) {
  GELOGD("To invoke rtKernelLaunch. task = %s, block_dim = %u", this->stub_name_.c_str(), block_dim_);
  auto ret = DoLaunchKernel(stream);
  int retry_times = 0;
  while (ret != RT_ERROR_NONE && retry_times < kLaunchRetryTimes) {
    retry_times++;
    GELOGW("Retry after %d ms, retry_times: %d", kSleepTime, retry_times);
    std::this_thread::sleep_for(std::chrono::milliseconds(kSleepTime));
    ret = DoLaunchKernel(stream);
  }
  if (ret != RT_ERROR_NONE) {
    GELOGE(ret, "[Invoke][RtKernelLaunch] failed. ret = %d, task = %s", ret, this->stub_name_.c_str());
    REPORT_INNER_ERROR("E19999", "invoke rtKernelLaunch failed, ret = %d, task = %s", ret, this->stub_name_.c_str());
    return RT_ERROR_TO_GE_STATUS(ret);
  }
  GELOGI("[TASK_INFO] %s", this->stub_name_.c_str());
  return SUCCESS;
}

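// Dynamic-shape tiling: CalcTilingInfo invokes the op tiling library
// (OpParaCalculateV2) on the cached node, and UpdateRunInfo caches the
// resulting block_dim, tiling data/key, atomic-clean flag and workspace sizes.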
Status TbeOpTask::CalcTilingInfo(optiling::utils::OpRunInfo &run_info) {
  auto ret = optiling::OpParaCalculateV2(*node_, run_info);
  if (ret != GRAPH_SUCCESS) {
    GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[Invoke][OpParaCalculate] failed, ret = %u.", ret);
    REPORT_INNER_ERROR("E19999", "invoke OpParaCalculate failed, ret = %u.", ret);
    return ACL_ERROR_GE_INTERNAL_ERROR;
  }
  return SUCCESS;
}

Status TbeOpTask::UpdateRunInfo() {
  // invoke OpParaCalculate
  GELOGD("Start to invoke OpParaCalculate.");
  optiling::utils::OpRunInfo run_info(0, true, 0);
  GE_CHK_STATUS_RET(CalcTilingInfo(run_info), "[Calc][TilingInfo]failed.");
  block_dim_ = run_info.GetBlockDim();
  tiling_data_ = run_info.GetAllTilingData().str();
  tiling_key_ = run_info.GetTilingKey();
  clear_atomic_ = run_info.GetClearAtomic();
  run_info.GetAllWorkspaces(run_info_workspaces_);
  GELOGD("Done invoking OpParaCalculate successfully. block_dim = %u, tiling size = %zu, tiling_key = %u", block_dim_,
         tiling_data_.size(), tiling_key_);
  return SUCCESS;
}

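// UpdateTensorDesc copies the runtime shape into the node's tensor desc; when
// ATTR_NAME_STORAGE_FORMAT is set, the storage shape becomes the shape and the
// runtime shape is kept as the origin shape instead.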
Status TbeOpTask::UpdateTensorDesc(const GeTensorDesc &src_tensor, GeTensorDesc &dst_tensor) {
  int64_t storage_format_val = static_cast<Format>(FORMAT_RESERVED);
  (void)AttrUtils::GetInt(src_tensor, ge::ATTR_NAME_STORAGE_FORMAT, storage_format_val);
  auto storage_format = static_cast<Format>(storage_format_val);
  if (storage_format == FORMAT_RESERVED) {
    GELOGD("Storage format not set. update shape to [%s], and original shape to [%s]",
           src_tensor.GetShape().ToString().c_str(), src_tensor.GetOriginShape().ToString().c_str());
    dst_tensor.SetShape(src_tensor.GetShape());
    dst_tensor.SetOriginShape(src_tensor.GetOriginShape());
  } else {
    std::vector<int64_t> storage_shape;
    if (!AttrUtils::GetListInt(src_tensor, ge::ATTR_NAME_STORAGE_SHAPE, storage_shape)) {
      GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[Get][ListInt]failed while storage_format was set.");
      return ACL_ERROR_GE_INTERNAL_ERROR;
    }
    GELOGD("Storage format set. update shape to [%s], and original shape to [%s]",
           GeShape(storage_shape).ToString().c_str(), src_tensor.GetShape().ToString().c_str());
    dst_tensor.SetShape(GeShape(std::move(storage_shape)));
    dst_tensor.SetOriginShape(src_tensor.GetShape());
  }
  return SUCCESS;
}

Status TbeOpTask::UpdateNodeByShape(const vector<GeTensorDesc> &input_desc, const vector<GeTensorDesc> &output_desc) {
  auto op_desc = node_->GetOpDesc();
  GE_CHECK_NOTNULL(op_desc);
  // Set runtime shape to node
  for (size_t i = 0; i < input_desc.size(); ++i) {
    auto tensor_desc = op_desc->MutableInputDesc(i);
    auto &runtime_tensor_desc = input_desc[i];
    GE_CHECK_NOTNULL(tensor_desc);
    GE_CHK_STATUS_RET(UpdateTensorDesc(runtime_tensor_desc, *tensor_desc));
  }
  for (size_t i = 0; i < output_desc.size(); ++i) {
    auto tensor_desc = op_desc->MutableOutputDesc(i);
    auto &runtime_tensor_desc = output_desc[i];
    GE_CHECK_NOTNULL(tensor_desc);
    GE_CHK_STATUS_RET(UpdateTensorDesc(runtime_tensor_desc, *tensor_desc));
  }
  return SUCCESS;
}

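// EnableDynamicSupport records the device tiling buffer and patches its slot
// in the arg table; by layout convention the tiling pointer sits right after
// the inputs, outputs and workspaces.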
Status TbeOpTask::EnableDynamicSupport(const NodePtr &node, void *tiling_buffer, uint32_t max_tiling_size) {
  if (tiling_buffer != nullptr) {
    uintptr_t *arg_base = nullptr;
    size_t arg_num = 0;
    GetIoAddr(arg_base, arg_num);
    GE_CHECK_NOTNULL(node);
    GE_CHECK_NOTNULL(node->GetOpDesc());
    uint32_t inputs_num = node->GetOpDesc()->GetInputsSize();
    uint32_t outputs_num = node->GetOpDesc()->GetOutputsSize();
    uint32_t workspace_nums = node->GetOpDesc()->GetWorkspace().size();
    uint32_t tiling_index = inputs_num + outputs_num + workspace_nums;
    if (arg_num == 0 || arg_num < tiling_index) {
      GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[Check][Size]Tiling index %u, arg number %zu is invalid.",
             tiling_index, arg_num);
      return ACL_ERROR_GE_INTERNAL_ERROR;
    }
    arg_base[tiling_index] = reinterpret_cast<uintptr_t>(tiling_buffer);
  }
  node_ = node;
  tiling_buffer_ = tiling_buffer;
  max_tiling_size_ = max_tiling_size;
  return SUCCESS;
}

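// AllocateWorkspaces makes one combined allocation on the stream resource and
// derives each workspace address from precomputed offsets; alignment and
// padding are expected to have been handled in OpParaCalculate.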
Status TbeOpTask::AllocateWorkspaces(const vector<int64_t> &workspace_sizes) {
  static const std::string kPurpose("malloc workspace memory for dynamic op.");
  workspaces_.clear();
  if (workspace_sizes.empty()) {
    GELOGD("No need to allocate workspace.");
    return SUCCESS;
  }
  int64_t total_size = 0;
  std::vector<int64_t> ws_offsets;
  for (auto ws_size : workspace_sizes) {
    // alignment and padding should be done in OpParaCalculate
    if (CheckInt64AddOverflow(total_size, ws_size) != SUCCESS) {
      return ACL_ERROR_GE_INTERNAL_ERROR;
    }
    ws_offsets.emplace_back(total_size);
    total_size += ws_size;
  }
  GELOGD("Total workspace size is %ld", total_size);
  GE_CHECK_NOTNULL(stream_resource_);
  auto ws_base = stream_resource_->MallocMemory(kPurpose, static_cast<size_t>(total_size));
  if (ws_base == nullptr) {
    GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Malloc][Memory] failed, size: %ld", total_size);
    REPORT_INNER_ERROR("E19999", "MallocMemory failed, size: %ld", total_size);
    return ACL_ERROR_GE_MEMORY_ALLOCATION;
  }
  GELOGD("Done allocating workspace memory successfully.");
  for (auto ws_offset : ws_offsets) {
    workspaces_.emplace_back(ws_base + ws_offset);
  }
  return SUCCESS;
}

Status TbeOpTask::CheckAndExecuteAtomic(const vector<GeTensorDesc> &input_desc,
                                        const vector<DataBuffer> &input_buffers,
                                        vector<GeTensorDesc> &output_desc,
                                        vector<DataBuffer> &output_buffers,
                                        rtStream_t stream) {
  if (clear_atomic_ && atomic_task_ != nullptr) {
    return atomic_task_->LaunchKernel(input_desc, input_buffers, output_desc, output_buffers, stream);
  }
  return SUCCESS;
}

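// UpdateTilingArgs grows args_ when needed (inputs + outputs + workspaces,
// plus one slot if a tiling buffer exists), appends the workspace addresses,
// and asynchronously copies the freshly computed tiling data to the device.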
Status TbeOpTask::UpdateTilingArgs(rtStream_t stream) {
  size_t args_size = input_num_ + output_num_ + workspaces_.size();
  if (tiling_buffer_ != nullptr) {
    args_size++;
  }
  size_t temp_size = args_size * sizeof(void *);
  if (arg_size_ < temp_size) {
    GELOGD("Need to reset size of args_ from %zu to %zu.", arg_size_, temp_size);
    std::unique_ptr<uint8_t[]> args(new (std::nothrow) uint8_t[temp_size]());
    GE_CHECK_NOTNULL(args);
    if (memcpy_s(args.get(), temp_size, args_.get(), arg_size_) != EOK) {
      GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "[Update][KernelArgs] failed for [%s].", node_->GetName().c_str());
      REPORT_INNER_ERROR("E19999", "update kernel args failed for %s.", node_->GetName().c_str());
      return ACL_ERROR_GE_MEMORY_OPERATE_FAILED;
    }
    args_ = std::move(args);
    arg_size_ = temp_size;
  }
  uintptr_t *arg_base = reinterpret_cast<uintptr_t *>(args_.get());
  size_t arg_index = input_num_ + output_num_;
  for (size_t i = 0; i < workspaces_.size(); ++i) {
    arg_base[arg_index++] = reinterpret_cast<uintptr_t>(workspaces_[i]);
  }
  if (tiling_buffer_ != nullptr) {
    GELOGD("[%s] Start to copy tiling info. size = %zu", node_->GetName().c_str(), tiling_data_.size());
    GE_CHK_RT_RET(rtMemcpyAsync(tiling_buffer_, max_tiling_size_, tiling_data_.data(), tiling_data_.size(),
                                RT_MEMCPY_HOST_TO_DEVICE_EX, stream));
    arg_base[arg_index] = reinterpret_cast<uintptr_t>(tiling_buffer_);
  }
  return SUCCESS;
}

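// SetArgIndex records, for each non-const input, its position in the arg
// table; const inputs occupy a slot but are skipped when addresses are
// refreshed at launch time in UpdateIoAddr.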
Status TbeOpTask::SetArgIndex() {
  const vector<bool> v_is_input_const = op_desc_->GetIsInputConst();
  size_t input_index = 0;
  for (size_t i = 0; i < op_desc_->GetAllInputsSize(); ++i) {
    const GeTensorDescPtr tensor_desc = op_desc_->MutableInputDesc(static_cast<uint32_t>(i));
    if (tensor_desc == nullptr) {
      GELOGD("SingleOp: %s, Index: %zu, has no input", op_desc_->GetName().c_str(), i);
      continue;
    }
    if (i < v_is_input_const.size() && v_is_input_const[i]) {
      GELOGD("SingleOp: %s, Index: %zu, input is const", op_desc_->GetName().c_str(), i);
      input_index++;
      continue;
    }
    arg_index_.emplace_back(input_index);
    input_index++;
  }
  return SUCCESS;
}

Status TbeOpTask::UpdateIoAddr(const vector<DataBuffer> &inputs, const vector<DataBuffer> &outputs) {
  if (arg_index_.size() != inputs.size()) {
    GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Check][Size] Args size is %zu, but get input size is %zu.",
           arg_index_.size(), inputs.size());
    REPORT_INNER_ERROR("E19999", "[Check][Size] Args size is %zu, but get input size is %zu.",
                       arg_index_.size(), inputs.size());
    return ACL_ERROR_GE_PARAM_INVALID;
  }
  uintptr_t *arg_base = reinterpret_cast<uintptr_t *>(args_.get());
  for (size_t i = 0; i < arg_index_.size(); ++i) {
    arg_base[arg_index_[i]] = reinterpret_cast<uintptr_t>(inputs[i].data);
  }
  for (size_t i = 0; i < op_desc_->GetOutputsSize(); ++i) {
    arg_base[input_num_ + i] = reinterpret_cast<uintptr_t>(outputs[i].data);
  }
  return SUCCESS;
}

Status TbeOpTask::LaunchKernel(const vector<GeTensorDesc> &input_desc,
                               const vector<DataBuffer> &input_buffers,
                               vector<GeTensorDesc> &output_desc,
                               vector<DataBuffer> &output_buffers,
                               rtStream_t stream) {
  GELOGD("[%s] Start to launch kernel", node_->GetName().c_str());
  GE_CHK_STATUS_RET(UpdateIoAddr(input_buffers, output_buffers), "[Update][IoAddr] failed.");
  GE_CHK_STATUS_RET_NOLOG(UpdateNodeByShape(input_desc, output_desc));
  GE_CHK_STATUS_RET_NOLOG(UpdateRunInfo());
  GE_CHK_STATUS_RET(AllocateWorkspaces(run_info_workspaces_), "[Allocate][Workspaces] failed.");
  GE_CHK_STATUS_RET(CheckAndExecuteAtomic(input_desc, input_buffers, output_desc, output_buffers, stream),
                    "[Execute][AtomicTask] failed.");
  GE_CHK_STATUS_RET(UpdateTilingArgs(stream), "[Update][TilingArgs] failed.");
  GELOGD("[%s] Start to invoke rtKernelLaunch", node_->GetName().c_str());
  GE_CHK_STATUS_RET(DoLaunchKernel(stream), "Failed to do launch kernel.");
  return SUCCESS;
}

Status TbeOpTask::DoLaunchKernel(rtStream_t stream) {
  auto *sm_desc = reinterpret_cast<rtSmDesc_t *>(sm_desc_);
  if (handle_ == nullptr) {
    GE_CHK_RT_RET(rtKernelLaunch(stub_func_, block_dim_, args_.get(), static_cast<uint32_t>(arg_size_),
                                 sm_desc, stream));
  } else {
    std::string dev_func = original_kernel_key_ + "_" + std::to_string(tiling_key_);
    std::string kernel_info = node_info_ + "/" + std::to_string(tiling_key_);
    GE_CHK_RT_RET(rtKernelLaunchWithHandle(handle_, dev_func.c_str(), block_dim_, args_.get(),
                                           static_cast<uint32_t>(arg_size_), sm_desc, stream, kernel_info.c_str()));
  }
  return SUCCESS;
}

void TbeOpTask::GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) {
  arg_base = reinterpret_cast<uintptr_t *>(args_.get());
  arg_count = arg_size_ / sizeof(void *);
  if (tiling_buffer_ != nullptr) {
    --arg_count;
  }
}

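// AtomicAddrCleanOpTask is the companion task that zeroes the atomic outputs
// of the main kernel before it runs. It is shape-independent, so
// UpdateNodeByShape is a no-op and UpdateIoAddr only patches the addresses
// and sizes of the outputs listed in atomic_output_indices_.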
Status AtomicAddrCleanOpTask::UpdateNodeByShape(const vector<GeTensorDesc> &input_desc,
                                                const vector<GeTensorDesc> &output_desc) {
  return SUCCESS;
}

Status AtomicAddrCleanOpTask::UpdateIoAddr(const vector<DataBuffer> &inputs, const vector<DataBuffer> &outputs) {
  uintptr_t *arg_base = reinterpret_cast<uintptr_t *>(args_.get());
  for (auto atomic_output_index : atomic_output_indices_) {
    if (atomic_output_index >= static_cast<int>(outputs.size())) {
      GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Update][Args] failed, atomic index must be smaller than data size.");
      REPORT_INNER_ERROR("E19999", "[Update][Args] failed, atomic index must be smaller than data size.");
      return ACL_ERROR_GE_PARAM_INVALID;
    }
    auto &output_buffer = outputs[atomic_output_index];
    *arg_base++ = reinterpret_cast<uintptr_t>(output_buffer.data);
    auto tensor_desc = op_desc_->MutableOutputDesc(atomic_output_index);
    int64_t size = 0;
    graphStatus graph_status = TensorUtils::GetTensorMemorySizeInBytes(*tensor_desc, size);
    if (graph_status != GRAPH_SUCCESS) {
      REPORT_CALL_ERROR("E19999", "Get tensor size in bytes failed!");
      GELOGE(graph_status, "[Get][TensorMemorySize] In Bytes failed!");
      return FAILED;
    }
    TensorUtils::SetSize(*tensor_desc, size);
  }
  return SUCCESS;
}

Status AtomicAddrCleanOpTask::UpdateTilingArgs(rtStream_t stream) {
  if (tiling_buffer_ != nullptr) {
    GELOGD("[%s] Start to copy tiling info. size = %zu", node_->GetName().c_str(), tiling_data_.size());
    GE_CHK_RT_RET(rtMemcpyAsync(tiling_buffer_, max_tiling_size_, tiling_data_.data(), tiling_data_.size(),
                                RT_MEMCPY_HOST_TO_DEVICE_EX, stream));
    uintptr_t *arg_base = reinterpret_cast<uintptr_t *>(args_.get());
    size_t idx = atomic_output_indices_.size();
    arg_base[idx] = reinterpret_cast<uintptr_t>(tiling_buffer_);
  }
  return SUCCESS;
}

Status AtomicAddrCleanOpTask::CalcTilingInfo(optiling::utils::OpRunInfo &run_info) {
  auto ret = optiling::OpAtomicCalculateV2(*node_, run_info);
  if (ret != GRAPH_SUCCESS) {
    GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[Invoke][OpAtomicCalculate] failed, ret = %u.", ret);
    REPORT_INNER_ERROR("E19999", "invoke OpAtomicCalculate failed, ret = %u.", ret);
    return ACL_ERROR_GE_INTERNAL_ERROR;
  }
  return SUCCESS;
}

Status AtomicAddrCleanOpTask::InitAtomicAddrCleanIndices() {
  GELOGD("[%s] Start to setup AtomicAddrClean task.", op_desc_->GetName().c_str());
  std::vector<int64_t> atomic_output_indices;
  (void) ge::AttrUtils::GetListInt(op_desc_, ATOMIC_ATTR_OUTPUT_INDEX, atomic_output_indices);
  if (atomic_output_indices.empty()) {
    GELOGE(INTERNAL_ERROR, "[Check][Size][%s] atomic_output_indices must not be empty.", op_desc_->GetName().c_str());
    REPORT_INNER_ERROR("E19999", "[%s] atomic_output_indices must not be empty.", op_desc_->GetName().c_str());
    return INTERNAL_ERROR;
  }
  size_t max_arg_size = tiling_buffer_ == nullptr ? arg_size_ : arg_size_ - 1;
  if (atomic_output_indices.size() > max_arg_size) {
    GELOGE(INTERNAL_ERROR, "[Check][Size][%s] atomic_output_indices invalid. atomic_output_indices size is %zu, "
           "arg size is %zu.", op_desc_->GetName().c_str(), atomic_output_indices.size(), arg_size_);
    REPORT_INNER_ERROR("E19999", "[%s] atomic_output_indices invalid. atomic_output_indices size is %zu, "
                       "arg size is %zu.", op_desc_->GetName().c_str(), atomic_output_indices.size(), arg_size_);
    return INTERNAL_ERROR;
  }
  for (auto output_index : atomic_output_indices) {
    GELOGD("[%s] Adding output index [%ld]", op_desc_->GetName().c_str(), output_index);
    GE_CHECK_GE(output_index, 0);
    GE_CHECK_LE(output_index, INT32_MAX);
    atomic_output_indices_.emplace_back(static_cast<int>(output_index));
  }
  return SUCCESS;
}

AiCpuBaseTask::~AiCpuBaseTask() {
  if (ext_info_addr_dev_ != nullptr) {
    (void)rtFree(ext_info_addr_dev_);
  }
  if (rt_event_ != nullptr) {
    (void)rtEventDestroy(rt_event_);
  }
  FreeHbm(copy_input_release_flag_dev_);
  FreeHbm(copy_input_data_size_dev_);
  FreeHbm(copy_input_src_dev_);
  FreeHbm(copy_input_dst_dev_);
  for (auto summary : output_summary_) {
    FreeHbm(summary);
  }
  for (auto out_shape : out_shape_hbm_) {
    FreeHbm(out_shape);
  }
}

Status AiCpuBaseTask::UpdateEventIdForBlockingAicpuOp() {
  bool is_support = false;
  if (CheckDeviceSupportBlockingAicpuOpProcess(is_support) != SUCCESS) {
    GELOGE(FAILED, "[Call][CheckDeviceSupportBlockingAicpuOpProcess] failed");
    return FAILED;
  }
  if (!is_support) {
    GELOGD("Device does not support blocking aicpu op process");
    return SUCCESS;
  }
  uint32_t event_id = 0;
  auto rt_ret = rtEventCreateWithFlag(&rt_event_, RT_EVENT_WITH_FLAG);
  if (rt_ret != RT_ERROR_NONE) {
    REPORT_CALL_ERROR("E19999", "Call rtEventCreateWithFlag failed, ret:0x%X", rt_ret);
    GELOGE(RT_FAILED, "[Call][rtEventCreateWithFlag] failed, ret:0x%X", rt_ret);
    return RT_ERROR_TO_GE_STATUS(rt_ret);
  }
  rt_ret = rtGetEventID(rt_event_, &event_id);
  if (rt_ret != RT_ERROR_NONE) {
    REPORT_CALL_ERROR("E19999", "Call rtGetEventID failed, ret:0x%X", rt_ret);
    GELOGE(RT_FAILED, "[Call][rtGetEventID] failed, ret:0x%X", rt_ret);
    return RT_ERROR_TO_GE_STATUS(rt_ret);
  }
  if (aicpu_ext_handle_->UpdateEventId(event_id) != SUCCESS) {
    REPORT_CALL_ERROR("E19999", "Update event id=%u failed.", event_id);
    GELOGE(FAILED, "[Update][EventId] Update event id=%u failed", event_id);
    return FAILED;
  }
  GELOGI("Update event_id=%u success", event_id);
  return SUCCESS;
}

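// SetExtInfoAndType parses the serialized aicpu ext info, stamps the session
// and kernel ids (plus an event id for blocking ops), then mirrors the ext
// info into device memory at ext_info_addr_dev_.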
Status AiCpuBaseTask::SetExtInfoAndType(const std::string &kernel_ext_info, uint64_t kernel_id) {
  if (kernel_ext_info.empty()) {
    GELOGI("Kernel_ext_info is empty, no need to copy to device.");
    return SUCCESS;
  }
  int32_t unknown_shape_type_val = 0;
  (void) AttrUtils::GetInt(op_desc_, ::ge::ATTR_NAME_UNKNOWN_SHAPE_TYPE, unknown_shape_type_val);
  GELOGD("Get unknown_type is %d.", unknown_shape_type_val);
  unknown_type_ = static_cast<UnknowShapeOpType>(unknown_shape_type_val);
  AttrUtils::GetBool(op_desc_, ATTR_NAME_IS_BLOCKING_OP, is_blocking_aicpu_op_);
  GELOGD("Get op:%s attribute(is_blocking_op), value:%d", op_desc_->GetName().c_str(), is_blocking_aicpu_op_);
  aicpu_ext_handle_.reset(new(std::nothrow) ::ge::hybrid::AicpuExtInfoHandler(op_desc_->GetName(),
                                                                              num_inputs_,
                                                                              num_outputs_,
                                                                              unknown_type_));
  GE_CHK_BOOL_RET_STATUS(aicpu_ext_handle_ != nullptr, ACL_ERROR_GE_MEMORY_ALLOCATION,
                         "[Malloc][Memory] failed for aicpu_ext_handle!");
  Status ret = aicpu_ext_handle_->Parse(kernel_ext_info);
  if (ret != SUCCESS) {
    GELOGE(ret, "[Parse][Param:kernel_ext_info] failed, kernel_ext_info_size=%zu.", kernel_ext_info.size());
    REPORT_INNER_ERROR("E19999",
                       "Parse Param:kernel_ext_info failed, kernel_ext_info_size=%zu.", kernel_ext_info.size());
    return ret;
  }
  GE_CHK_STATUS_RET(aicpu_ext_handle_->UpdateSessionInfo(ULLONG_MAX, kernel_id, false),
                    "[Update][SessionInfo] failed.");
  if (is_blocking_aicpu_op_) {
    if (UpdateEventIdForBlockingAicpuOp() != SUCCESS) {
      GELOGE(FAILED, "[Call][UpdateEventIdForBlockingAicpuOp] failed");
      return FAILED;
    }
  }
  GE_CHK_RT_RET(rtMalloc(&ext_info_addr_dev_, aicpu_ext_handle_->GetExtInfoLen(), RT_MEMORY_HBM));
  GE_CHK_RT_RET(rtMemcpy(ext_info_addr_dev_, aicpu_ext_handle_->GetExtInfoLen(),
                         aicpu_ext_handle_->GetExtInfo(), aicpu_ext_handle_->GetExtInfoLen(),
                         RT_MEMCPY_HOST_TO_DEVICE));
  return SUCCESS;
}

Status AiCpuBaseTask::SetInputConst() {
  input_is_const_.clear();
  const vector<bool> v_is_input_const = op_desc_->GetIsInputConst();
  for (size_t i = 0; i < op_desc_->GetAllInputsSize(); ++i) {
    const GeTensorDescPtr tensor_desc = op_desc_->MutableInputDesc(static_cast<uint32_t>(i));
    if (tensor_desc == nullptr) {
      GELOGD("SingleOp: %s, Index: %zu, has no input", op_desc_->GetName().c_str(), i);
      continue;
    }
    if (i < v_is_input_const.size() && v_is_input_const[i]) {
      GELOGD("SingleOp: %s, Index: %zu, input is const", op_desc_->GetName().c_str(), i);
      input_is_const_.push_back(true);
      continue;
    }
    input_is_const_.push_back(false);
  }
  return SUCCESS;
}

Status AiCpuBaseTask::UpdateExtInfo(const std::vector<GeTensorDesc> &input_desc,
                                    std::vector<GeTensorDesc> &output_desc,
                                    rtStream_t stream) {
  GELOGI("Update ext info begin, unknown_type=%d.", unknown_type_);
  GE_CHECK_NOTNULL(aicpu_ext_handle_);
  GE_CHK_STATUS_RET(aicpu_ext_handle_->UpdateExecuteMode(false), "[Update][ExecuteMode] failed.");
  if (num_inputs_ == 0 && num_outputs_ == 0) {
    GELOGI("No input and output, no need to update ext info.");
    return SUCCESS;
  }
  size_t non_const_index = 0;
  for (size_t input_index = 0; input_index < num_inputs_; input_index++) {
    if (input_index < input_is_const_.size() && input_is_const_[input_index]) {
      // get input_desc from op_desc if const input, num_inputs_ is op_desc_ input_size
      auto const_input_desc = op_desc_->MutableInputDesc(static_cast<uint32_t>(input_index));
      GE_CHECK_NOTNULL(const_input_desc);
      GE_CHK_STATUS_RET(aicpu_ext_handle_->UpdateInputShapeAndType(input_index, *const_input_desc),
                        "[Update][InputShapeAndType] failed, input_index:%zu.", input_index);
      continue;
    }
    GE_CHK_BOOL_RET_STATUS(non_const_index < input_desc.size(), ACL_ERROR_GE_PARAM_INVALID,
                           "[Check][Size]Input_desc size is %zu, but get non_const_index is %zu",
                           input_desc.size(), non_const_index);
    GE_CHK_STATUS_RET(aicpu_ext_handle_->UpdateInputShapeAndType(input_index, input_desc[non_const_index]),
                      "[Update][InputShapeAndType]failed, input_index:%zu.", input_index);
    if (DumpManager::GetInstance().GetDumpProperties(kInferSessionId).IsSingleOpNeedDump()) {
      GE_CHK_STATUS_RET(op_desc_->UpdateInputDesc(input_index, input_desc[non_const_index]),
                        "AiCpuTask Update [%zu]th input desc failed.", input_index);
    }
    non_const_index++;
  }
  if (unknown_type_ != DEPEND_COMPUTE) {
    for (size_t j = 0; j < num_outputs_; ++j) {
      GE_CHK_STATUS_RET(aicpu_ext_handle_->UpdateOutputShapeAndType(j, output_desc[j]),
                        "[Update][OutputShapeAndType] failed, Output:%zu.", j);
      if (DumpManager::GetInstance().GetDumpProperties(kInferSessionId).IsSingleOpNeedDump()) {
        GE_CHK_STATUS_RET(op_desc_->UpdateOutputDesc(j, output_desc[j]),
                          "AiCpuTask Update [%zu]th output desc failed.", j);
      }
    }
  }
  GE_CHK_RT_RET(rtMemcpyAsync(ext_info_addr_dev_,
                              aicpu_ext_handle_->GetExtInfoLen(),  // check size
                              aicpu_ext_handle_->GetExtInfo(),
                              aicpu_ext_handle_->GetExtInfoLen(),
                              RT_MEMCPY_HOST_TO_DEVICE_EX,
                              stream));
  GELOGI("Update ext info end.");
  return SUCCESS;
}

Status AiCpuBaseTask::UpdateOutputShape(vector<GeTensorDesc> &output_desc) {
  if (num_outputs_ == 0) {
    GELOGD("AiCpuBaseTask output_num is 0, no need to update output shape.");
    return SUCCESS;
  }
  GELOGD("Start to update DEPEND_SHAPE_RANGE AiCpuBaseTask output shape.");
  GE_CHK_RT_RET(rtMemcpy(aicpu_ext_handle_->GetExtInfo(), aicpu_ext_handle_->GetExtInfoLen(), ext_info_addr_dev_,
                         aicpu_ext_handle_->GetExtInfoLen(), RT_MEMCPY_DEVICE_TO_HOST));
  for (size_t i = 0; i < num_outputs_; ++i) {
    GeShape shape;
    DataType data_type;
    aicpu_ext_handle_->GetOutputShapeAndType(i, shape, data_type);
    GE_CHK_STATUS_RET(UpdateShapeToOutputDesc(shape, output_desc[i]),
                      "[Update][ShapeToOutputDesc] failed, output:%zu.", i);
    if (DumpManager::GetInstance().GetDumpProperties(kInferSessionId).IsSingleOpNeedDump()) {
      GE_CHK_STATUS_RET(op_desc_->UpdateOutputDesc(i, output_desc[i]), "[Update][OutputDesc] failed, output:%zu.", i);
    }
  }
  GELOGD("Update DEPEND_SHAPE_RANGE AiCpuBaseTask output shape finished.");
  return SUCCESS;
}

Status AiCpuBaseTask::UpdateShapeToOutputDesc(const GeShape &shape_new, GeTensorDesc &output_desc) {
  auto shape_old = output_desc.GetShape();
  output_desc.SetShape(shape_new);
  GELOGD("Update AiCpuBaseTask shape from %s to %s", shape_old.ToString().c_str(), shape_new.ToString().c_str());
  auto origin_shape_old = output_desc.GetOriginShape();
  auto origin_format = output_desc.GetOriginFormat();
  auto format = output_desc.GetFormat();
  if (origin_format == format) {
    output_desc.SetOriginShape(shape_new);
    return SUCCESS;
  }
  std::vector<int64_t> origin_dims_new;
  auto trans_ret = formats::TransShape(format, shape_new.GetDims(),
                                       output_desc.GetDataType(), origin_format, origin_dims_new);
  GE_CHK_STATUS_RET(trans_ret,
                    "[Trans][Shape] failed, AiCpuTask originFormat[%d] is not same as format[%d], shape=%s.",
                    origin_format, format, shape_new.ToString().c_str());
  auto origin_shape_new = GeShape(origin_dims_new);
  output_desc.SetOriginShape(origin_shape_new);
  GELOGD("AiCpuTask originFormat[%d] is not same as format[%d], need update from %s to %s.",
         origin_format, format, origin_shape_old.ToString().c_str(), origin_shape_new.ToString().c_str());
  return SUCCESS;
}

Status AiCpuBaseTask::UpdateIoAddr(const vector<DataBuffer> &inputs, const vector<DataBuffer> &outputs) {
  uintptr_t *arg_base = nullptr;
  size_t arg_num = 0;
  GetIoAddr(arg_base, arg_num);
  // input number and output number were checked in ValidateParams
  size_t non_const_index = 0;
  for (size_t input_index = 0; input_index < num_inputs_; input_index++) {
    if (input_index < input_is_const_.size() && input_is_const_[input_index]) {
      // const input does not need its addr updated
      GE_CHECK_NOTNULL(arg_base);
      GELOGD("AICpuTask input[%zu] addr = %lu", input_index, *arg_base);
      arg_base++;
      continue;
    }
    GE_CHK_BOOL_RET_STATUS(non_const_index < inputs.size(), ACL_ERROR_GE_PARAM_INVALID,
                           "[Check][Size] Input size is %zu, but get non_const_index is %zu",
                           inputs.size(), non_const_index);
    auto addr = inputs[non_const_index].data;
    uint64_t length = inputs[non_const_index].length;
    if (length != 0 && addr == nullptr) {
      GELOGE(PARAM_INVALID, "[Check][Addr]AiCpuTask input[%zu] addr is nullptr, length = %lu", input_index, length);
      return PARAM_INVALID;
    }
    GELOGD("AICpuTask input[%zu] addr = %p, length = %lu.", input_index, addr, length);
    *arg_base++ = reinterpret_cast<uintptr_t>(addr);
    non_const_index++;
  }
  for (size_t i = 0; i < outputs.size(); ++i) {
    auto addr = outputs[i].data;
    uint64_t length = outputs[i].length;
    if (length != 0 && addr == nullptr) {
      GELOGE(PARAM_INVALID, "[Check][Addr]AiCpuTask output[%zu] addr is nullptr, length = %lu", i, length);
      return PARAM_INVALID;
    }
    GELOGD("AICpuTask output[%zu] addr = %p, length = %lu.", i, addr, length);
    *arg_base++ = reinterpret_cast<uintptr_t>(addr);
  }
  return SUCCESS;
}

Status AiCpuBaseTask::CheckDeviceSupportBlockingAicpuOpProcess(bool &is_support) {
  int32_t device_id = 0;
  auto rt_ret = rtGetDevice(&device_id);
  if (rt_ret != RT_ERROR_NONE) {
    REPORT_CALL_ERROR("E19999", "Call rtGetDevice failed, ret:0x%X", rt_ret);
    GELOGE(RT_FAILED, "[Call][rtGetDevice] failed, ret:0x%X", rt_ret);
    return RT_ERROR_TO_GE_STATUS(rt_ret);
  }
  int32_t value = 0;
  rt_ret = rtGetDeviceCapability(device_id, FEATURE_TYPE_BLOCKING_OPERATOR, RT_MODULE_TYPE_AICPU, &value);
  if (rt_ret != RT_ERROR_NONE) {
    REPORT_CALL_ERROR("E19999", "Call rtGetDeviceCapability failed, ret:0x%X", rt_ret);
    GELOGE(RT_FAILED, "[Call][rtGetDeviceCapability] failed, ret:0x%X", rt_ret);
    return RT_ERROR_TO_GE_STATUS(rt_ret);
  }
  if (value != RT_AICPU_BLOCKING_OP_NOT_SUPPORT && value != RT_AICPU_BLOCKING_OP_SUPPORT) {
    REPORT_INNER_ERROR("E19999", "Value should be %d or %d but %d",
                       RT_AICPU_BLOCKING_OP_NOT_SUPPORT, RT_AICPU_BLOCKING_OP_SUPPORT, value);
    GELOGE(FAILED, "[Check][Value] Value should be %d or %d but %d",
           RT_AICPU_BLOCKING_OP_NOT_SUPPORT, RT_AICPU_BLOCKING_OP_SUPPORT, value);
    return FAILED;
  }
  is_support = (value == RT_AICPU_BLOCKING_OP_SUPPORT);
  return SUCCESS;
}

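// For blocking aicpu ops the host appends a wait task: the stream blocks on
// rt_event_, which the aicpu kernel is expected to signal on completion, and
// the event is then reset so it can be reused by the next launch.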
Status AiCpuBaseTask::DistributeWaitTaskForAicpuBlockingOp(rtStream_t stream) {
  bool is_support = false;
  if (CheckDeviceSupportBlockingAicpuOpProcess(is_support) != SUCCESS) {
    GELOGE(FAILED, "[Call][CheckDeviceSupportBlockingAicpuOpProcess] failed");
    return FAILED;
  }
  if (!is_support) {
    GELOGD("Device does not support blocking aicpu op process.");
    return SUCCESS;
  }
  GELOGI("Distribute queue task begin");
  if (rt_event_ == nullptr) {
    REPORT_INNER_ERROR("E19999", "rt_event_ is nullptr");
    GELOGE(FAILED, "[Check][rt_event_] rt_event_ is nullptr");
    return FAILED;
  }
  auto rt_ret = rtStreamWaitEvent(stream, rt_event_);
  if (rt_ret != RT_ERROR_NONE) {
    REPORT_CALL_ERROR("E19999", "Call rtStreamWaitEvent failed, ret:0x%X", rt_ret);
    GELOGE(RT_FAILED, "[Call][RtApi] failed, ret:0x%X", rt_ret);
    return RT_ERROR_TO_GE_STATUS(rt_ret);
  }
  rt_ret = rtEventReset(rt_event_, stream);
  if (rt_ret != RT_ERROR_NONE) {
    REPORT_CALL_ERROR("E19999", "Call rtEventReset failed, ret:0x%X", rt_ret);
    GELOGE(RT_FAILED, "[Call][RtApi] failed, ret:0x%X", rt_ret);
    return RT_ERROR_TO_GE_STATUS(rt_ret);
  }
  return SUCCESS;
}

AiCpuTask::~AiCpuTask() {
  FreeHbm(args_);
  FreeHbm(io_addr_);
  FreeHbm(workspace_addr_);
  FreeHbm(copy_workspace_buf_);
  FreeHbm(copy_ioaddr_dev_);
  FreeHbm(copy_task_args_buf_);
}

Status AiCpuTask::LaunchKernel(rtStream_t stream) {
  GELOGD("Start to launch kernel. task = %s", this->op_type_.c_str());
  auto ret = rtMemcpyAsync(io_addr_,
                           io_addr_size_,
                           io_addr_host_.data(),
                           io_addr_host_.size() * sizeof(void *),
                           RT_MEMCPY_HOST_TO_DEVICE_EX,
                           stream);
  if (ret != RT_ERROR_NONE) {
    GELOGE(ret, "[MemcpyAsync][Data] failed. ret = %d, task = %s", ret, this->op_type_.c_str());
    REPORT_CALL_ERROR("E19999", "rtMemcpyAsync data failed, ret = %d, task = %s", ret, this->op_type_.c_str());
    return RT_ERROR_TO_GE_STATUS(ret);
  }
  GELOGI("To invoke rtKernelLaunchEx. task = %s", this->op_type_.c_str());
  ret = rtKernelLaunchEx(args_, arg_size_, 0, stream);
  if (ret != RT_ERROR_NONE) {
    GELOGE(ret, "[Invoke][rtKernelLaunch] failed. ret = %d, task = %s", ret, this->op_type_.c_str());
    REPORT_CALL_ERROR("E19999", "invoke rtKernelLaunchEx failed, ret = %d, task = %s", ret, this->op_type_.c_str());
    return RT_ERROR_TO_GE_STATUS(ret);
  }
  GELOGI("[TASK_INFO] %lu/%s", kernel_id_, op_type_.c_str());
  GELOGD("Done launching kernel successfully. task = %s", this->op_type_.c_str());
  if (is_blocking_aicpu_op_) {
    if (DistributeWaitTaskForAicpuBlockingOp(stream) != SUCCESS) {
      GELOGE(FAILED, "[Call][DistributeWaitTaskForAicpuBlockingOp] failed");
      return FAILED;
    }
  }
  return SUCCESS;
}

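// DEPEND_COMPUTE output flow: each output contributes kCopyNum (= 2) copy
// entries, one for its raw data and one for its shape buffer. The four
// copy_input_* device arrays staged here are the inputs of the memcpy task
// launched by CopyDataToHbm.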
Status AiCpuBaseTask::PrepareCopyInputs(vector<DataBuffer> &outputs) {
  std::vector<uint64_t> copy_input_release_flag;
  std::vector<uint64_t> copy_input_data_size;
  std::vector<uint64_t> copy_input_src;
  std::vector<uint64_t> copy_input_dst;
  for (size_t i = 0; i < num_outputs_; ++i) {
    const auto &summary = output_summary_host_[i];
    GELOGI("Node out[%zu] summary, shape data=0x%lx, shape data size=%lu, raw data=0x%lx, raw data size=%lu.",
           i, summary.shape_data_ptr, summary.shape_data_size,
           summary.raw_data_ptr, summary.raw_data_size);
    auto output = outputs[i];
    copy_input_release_flag.emplace_back(kReleaseFlag);
    if (summary.raw_data_size > 0) {
      copy_input_data_size.emplace_back(output.length);
    } else {
      copy_input_data_size.emplace_back(summary.raw_data_size);
    }
    copy_input_src.emplace_back(summary.raw_data_ptr);
    copy_input_dst.emplace_back(reinterpret_cast<uintptr_t>(output.data));
    const auto &shape_buffer = out_shape_hbm_[i];
    copy_input_release_flag.emplace_back(kReleaseFlag);
    copy_input_data_size.emplace_back(summary.shape_data_size);
    copy_input_src.emplace_back(summary.shape_data_ptr);
    copy_input_dst.emplace_back(reinterpret_cast<uintptr_t>(shape_buffer));
  }
  const size_t copy_input_buf_len = num_outputs_ * kCopyNum * sizeof(uint64_t);
  GE_CHK_RT_RET(rtMemcpy(copy_input_release_flag_dev_, copy_input_buf_len,
                         copy_input_release_flag.data(), copy_input_buf_len, RT_MEMCPY_HOST_TO_DEVICE));
  GE_CHK_RT_RET(rtMemcpy(copy_input_data_size_dev_, copy_input_buf_len,
                         copy_input_data_size.data(), copy_input_buf_len, RT_MEMCPY_HOST_TO_DEVICE));
  GE_CHK_RT_RET(rtMemcpy(copy_input_src_dev_, copy_input_buf_len,
                         copy_input_src.data(), copy_input_buf_len, RT_MEMCPY_HOST_TO_DEVICE));
  GE_CHK_RT_RET(rtMemcpy(copy_input_dst_dev_, copy_input_buf_len,
                         copy_input_dst.data(), copy_input_buf_len, RT_MEMCPY_HOST_TO_DEVICE));
  return SUCCESS;
}

Status AiCpuBaseTask::ReadResultSummaryAndPrepareMemory() {
  for (size_t i = 0; i < num_outputs_; ++i) {
    auto &result_summary = output_summary_host_[i];
    GE_CHK_RT_RET(rtMemcpy(&result_summary, sizeof(aicpu::FWKAdapter::ResultSummary),
                           output_summary_[i], sizeof(aicpu::FWKAdapter::ResultSummary),
                           RT_MEMCPY_DEVICE_TO_HOST));
    auto shape_data_size = result_summary.shape_data_size;
    void *shape_buffer = nullptr;
    if (shape_data_size > 0) {
      GE_CHK_RT_RET(rtMalloc(&shape_buffer, shape_data_size, RT_MEMORY_HBM));
    }
    out_shape_hbm_.emplace_back(shape_buffer);
  }
  return SUCCESS;
}

  875. }
  876. Status AiCpuCCTask::CopyDataToHbm(vector<DataBuffer> &outputs,
  877. rtStream_t stream) {
  878. GE_CHK_STATUS_RET_NOLOG(PrepareCopyInputs(outputs));
  879. auto ret = rtCpuKernelLaunchWithFlag(static_cast<const void *>(memcpy_so_name_.data()),
  880. static_cast<const void *>(memcpy_kernel_name_.data()),
  881. block_dim_, memcpy_args_.get(), static_cast<uint32_t>(memcpy_args_size_),
  882. nullptr, stream, RT_KERNEL_DEFAULT);
  883. GE_CHK_RT_RET(ret);
  884. GE_CHK_RT_RET(rtStreamSynchronize(stream));
  885. return SUCCESS;
  886. }
  887. Status AiCpuTask::CopyDataToHbm(vector<DataBuffer> &outputs,
  888. rtStream_t stream) {
  889. GE_CHK_STATUS_RET_NOLOG(PrepareCopyInputs(outputs));
  890. GE_CHK_RT_RET(rtKernelLaunchEx(copy_task_args_buf_, sizeof(STR_FWK_OP_KERNEL),
  891. RT_KERNEL_DEFAULT, stream));
  892. GE_CHK_RT_RET(rtStreamSynchronize(stream));
  893. return SUCCESS;
  894. }
  895. Status AiCpuBaseTask::UpdateShapeByHbmBuffer(vector<GeTensorDesc> &output_desc) {
  896. for (size_t i = 0; i < num_outputs_; ++i) {
  897. const auto &result_summary = output_summary_host_[i];
  898. std::vector<int64_t> shape_dims;
  899. if (result_summary.shape_data_size > 0) {
  900. const auto &shape_hbm = out_shape_hbm_[i];
  901. uint32_t dim_num = result_summary.shape_data_size / sizeof(int64_t);
  902. std::unique_ptr<int64_t[]> shape_addr(new (std::nothrow) int64_t[dim_num]());
  903. GE_CHECK_NOTNULL(shape_addr);
  904. GE_CHK_RT_RET(rtMemcpy(shape_addr.get(), result_summary.shape_data_size, shape_hbm,
  905. result_summary.shape_data_size, RT_MEMCPY_DEVICE_TO_HOST));
  906. for (uint32_t dim_idx = 0; dim_idx < dim_num; ++dim_idx) {
  907. shape_dims.emplace_back(shape_addr[dim_idx]);
  908. GELOGD("Node [%zu]th output dim[%u]=%ld.", i, dim_idx, shape_addr[dim_idx]);
  909. }
  910. }
  911. GE_CHK_STATUS_RET(UpdateShapeToOutputDesc(GeShape(shape_dims), output_desc[i]),
  912. "[Update][ShapeToOutputDesc] failed , output:%zu.", i);
  913. if (DumpManager::GetInstance().GetDumpProperties(kInferSessionId).IsSingleOpNeedDump()) {
  914. GE_CHK_STATUS_RET(op_desc_->UpdateOutputDesc(i, output_desc[i]), "[Update][OutputDesc] failed, output:%zu.", i);
  915. }
  916. }
  917. return SUCCESS;
  918. }
  919. Status AiCpuBaseTask::UpdateShapeAndDataByResultSummary(vector<GeTensorDesc> &output_desc,
  920. vector<DataBuffer> &outputs,
  921. rtStream_t stream) {
  922. if (num_outputs_ == 0) {
  923. GELOGI("Output num is 0, there is no need to update the output and size.");
  924. return SUCCESS;
  925. }
  926. GELOGI("Update shape and data by result summary begin.");
  927. for (auto out_shape : out_shape_hbm_) {
  928. FreeHbm(out_shape);
  929. }
  930. out_shape_hbm_.clear();
  931. GE_CHK_STATUS_RET(ReadResultSummaryAndPrepareMemory(),
  932. "[Read][ResultSummaryAndPrepareMemory] failed.");
  933. GE_CHK_STATUS_RET(CopyDataToHbm(outputs, stream),
  934. "[Copy][DataToHbm] failed.");
  935. GE_CHK_STATUS_RET(UpdateShapeByHbmBuffer(output_desc),
  936. "[Update][ShapeByHbmBuffer] failed.");
  937. for (auto out_shape : out_shape_hbm_) {
  938. FreeHbm(out_shape);
  939. }
  940. out_shape_hbm_.clear();
  941. GELOGI("Update shape and data by result summary end.");
  942. return SUCCESS;
  943. }
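// InitForSummaryAndCopy pre-allocates the per-output ResultSummary buffers
// and the copy_input_* device arrays used by CopyDataToHbm; it only applies
// to DEPEND_COMPUTE ops with at least one output.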
Status AiCpuTask::InitForSummaryAndCopy() {
  if (unknown_type_ != DEPEND_COMPUTE || num_outputs_ == 0) {
    GELOGI("Unknown_type is %d, output num is %zu.", unknown_type_, num_outputs_);
    return SUCCESS;
  }
  output_summary_.resize(num_outputs_);
  constexpr auto result_summary_size = sizeof(aicpu::FWKAdapter::ResultSummary);
  for (size_t i = 0; i < num_outputs_; ++i) {
    GE_CHK_RT_RET(rtMalloc(&output_summary_[i], result_summary_size, RT_MEMORY_HBM));
  }
  output_summary_host_.resize(num_outputs_);
  const size_t copy_input_buf_len = num_outputs_ * kCopyNum * sizeof(uint64_t);
  GE_CHK_RT_RET(rtMalloc(&copy_input_release_flag_dev_, copy_input_buf_len, RT_MEMORY_HBM));
  GE_CHK_RT_RET(rtMalloc(&copy_input_data_size_dev_, copy_input_buf_len, RT_MEMORY_HBM));
  GE_CHK_RT_RET(rtMalloc(&copy_input_src_dev_, copy_input_buf_len, RT_MEMORY_HBM));
  GE_CHK_RT_RET(rtMalloc(&copy_input_dst_dev_, copy_input_buf_len, RT_MEMORY_HBM));
  GE_CHK_RT_RET(rtMalloc(&copy_task_args_buf_, sizeof(STR_FWK_OP_KERNEL), RT_MEMORY_HBM));
  std::vector<uint64_t> copy_io_addr;
  copy_io_addr.emplace_back(reinterpret_cast<uintptr_t>(copy_input_release_flag_dev_));
  copy_io_addr.emplace_back(reinterpret_cast<uintptr_t>(copy_input_data_size_dev_));
  copy_io_addr.emplace_back(reinterpret_cast<uintptr_t>(copy_input_src_dev_));
  copy_io_addr.emplace_back(reinterpret_cast<uintptr_t>(copy_input_dst_dev_));
  const auto copy_io_addr_size = sizeof(uint64_t) * copy_io_addr.size();
  GE_CHK_RT_RET(rtMalloc(&copy_ioaddr_dev_, copy_io_addr_size, RT_MEMORY_HBM));
  GE_CHK_RT_RET(rtMemcpy(copy_ioaddr_dev_, copy_io_addr_size,
                         copy_io_addr.data(), copy_io_addr_size, RT_MEMCPY_HOST_TO_DEVICE));
  return SUCCESS;
}

Status AiCpuTask::SetMemCopyTask(const domi::KernelExDef &kernel_def) {
  if (kernel_def.args_size() > sizeof(STR_FWK_OP_KERNEL)) {
    GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Check][Size]sizeof STR_FWK_OP_KERNEL is: %lu, but args_size is: %d",
           sizeof(STR_FWK_OP_KERNEL), kernel_def.args_size());
    REPORT_INNER_ERROR("E19999", "sizeof STR_FWK_OP_KERNEL is: %lu, but args_size is: %d",
                       sizeof(STR_FWK_OP_KERNEL), kernel_def.args_size());
    return ACL_ERROR_GE_PARAM_INVALID;
  }
  GE_CHK_RT_RET(rtMalloc(&copy_workspace_buf_, kernel_def.task_info_size(), RT_MEMORY_HBM));
  GE_CHK_RT_RET(rtMemcpy(copy_workspace_buf_, kernel_def.task_info_size(),
                         kernel_def.task_info().data(), kernel_def.task_info_size(), RT_MEMCPY_HOST_TO_DEVICE));
  STR_FWK_OP_KERNEL aicpu_task = {0};
  auto sec_ret = memcpy_s(&aicpu_task, sizeof(STR_FWK_OP_KERNEL),
                          kernel_def.args().data(), kernel_def.args().size());
  if (sec_ret != EOK) {
    GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "[Update][TaskArgs] failed, ret: %d", sec_ret);
    REPORT_INNER_ERROR("E19999", "update STR_FWK_OP_KERNEL args failed because memcpy_s returned %d.", sec_ret);
    return ACL_ERROR_GE_MEMORY_OPERATE_FAILED;
  }
  aicpu_task.fwkKernelBase.fwk_kernel.inputOutputAddr = reinterpret_cast<uintptr_t>(copy_ioaddr_dev_);
  aicpu_task.fwkKernelBase.fwk_kernel.workspaceBaseAddr = reinterpret_cast<uintptr_t>(copy_workspace_buf_);
  aicpu_task.fwkKernelBase.fwk_kernel.extInfoAddr = 0;
  aicpu_task.fwkKernelBase.fwk_kernel.extInfoLen = 0;
  GE_CHK_RT_RET(rtMemcpy(copy_task_args_buf_, sizeof(STR_FWK_OP_KERNEL),
                         &aicpu_task, sizeof(STR_FWK_OP_KERNEL), RT_MEMCPY_HOST_TO_DEVICE));
  return SUCCESS;
}

Status AiCpuTask::LaunchKernel(const std::vector<GeTensorDesc> &input_desc,
                               const std::vector<DataBuffer> &input_buffers,
                               std::vector<GeTensorDesc> &output_desc,
                               std::vector<DataBuffer> &output_buffers,
                               rtStream_t stream) {
  GE_CHK_STATUS_RET_NOLOG(UpdateExtInfo(input_desc, output_desc, stream));
  if (unknown_type_ == DEPEND_COMPUTE) {
    std::vector<DataBuffer> summary_buffers;
    for (size_t i = 0; i < num_outputs_; ++i) {
      summary_buffers.emplace_back(output_summary_[i], sizeof(aicpu::FWKAdapter::ResultSummary), false);
    }
    GE_CHK_STATUS_RET_NOLOG(UpdateIoAddr(input_buffers, summary_buffers));
  } else {
    GE_CHK_STATUS_RET_NOLOG(UpdateIoAddr(input_buffers, output_buffers));
  }
  GE_CHK_STATUS_RET_NOLOG(LaunchKernel(stream));
  if (unknown_type_ == DEPEND_SHAPE_RANGE) {
    GE_CHK_RT_RET(rtStreamSynchronize(stream));
    GE_CHK_STATUS_RET_NOLOG(UpdateOutputShape(output_desc));
  } else if (unknown_type_ == DEPEND_COMPUTE) {
    GE_CHK_RT_RET(rtStreamSynchronize(stream));
    GE_CHK_STATUS_RET_NOLOG(UpdateShapeAndDataByResultSummary(output_desc, output_buffers, stream));
  }
  return SUCCESS;
}

Status AiCpuCCTask::LaunchKernel(const std::vector<GeTensorDesc> &input_desc,
                                 const std::vector<DataBuffer> &input_buffers,
                                 std::vector<GeTensorDesc> &output_desc,
                                 std::vector<DataBuffer> &output_buffers,
                                 rtStream_t stream) {
  GE_CHK_STATUS_RET_NOLOG(UpdateExtInfo(input_desc, output_desc, stream));
  if (unknown_type_ == DEPEND_COMPUTE) {
    std::vector<DataBuffer> summary_buffers;
    for (size_t i = 0; i < num_outputs_; ++i) {
      summary_buffers.emplace_back(output_summary_[i], sizeof(aicpu::FWKAdapter::ResultSummary), false);
    }
    GE_CHK_STATUS_RET_NOLOG(UpdateIoAddr(input_buffers, summary_buffers));
  } else {
    GE_CHK_STATUS_RET_NOLOG(UpdateIoAddr(input_buffers, output_buffers));
  }
  GE_CHK_STATUS_RET_NOLOG(LaunchKernel(stream));
  if (unknown_type_ == DEPEND_SHAPE_RANGE) {
    GE_CHK_RT_RET(rtStreamSynchronize(stream));
    GE_CHK_STATUS_RET_NOLOG(UpdateOutputShape(output_desc));
  } else if (unknown_type_ == DEPEND_COMPUTE) {
    GE_CHK_RT_RET(rtStreamSynchronize(stream));
    GE_CHK_STATUS_RET_NOLOG(UpdateShapeAndDataByResultSummary(output_desc, output_buffers, stream));
  }
  return SUCCESS;
}

Status AiCpuCCTask::InitForSummaryAndCopy() {
  if (unknown_type_ != DEPEND_COMPUTE || num_outputs_ == 0) {
    GELOGI("Unknown_type is %d, output num is %zu.", unknown_type_, num_outputs_);
    return SUCCESS;
  }
  output_summary_.resize(num_outputs_);
  constexpr auto result_summary_size = sizeof(aicpu::FWKAdapter::ResultSummary);
  for (size_t i = 0; i < num_outputs_; ++i) {
    GE_CHK_RT_RET(rtMalloc(&output_summary_[i], result_summary_size, RT_MEMORY_HBM));
  }
  output_summary_host_.resize(num_outputs_);
  const size_t copy_input_buf_len = num_outputs_ * kCopyNum * sizeof(uint64_t);
  GE_CHK_RT_RET(rtMalloc(&copy_input_release_flag_dev_, copy_input_buf_len, RT_MEMORY_HBM));
  GE_CHK_RT_RET(rtMalloc(&copy_input_data_size_dev_, copy_input_buf_len, RT_MEMORY_HBM));
  GE_CHK_RT_RET(rtMalloc(&copy_input_src_dev_, copy_input_buf_len, RT_MEMORY_HBM));
  GE_CHK_RT_RET(rtMalloc(&copy_input_dst_dev_, copy_input_buf_len, RT_MEMORY_HBM));
  copy_io_addr_.emplace_back(reinterpret_cast<uintptr_t>(copy_input_release_flag_dev_));
  copy_io_addr_.emplace_back(reinterpret_cast<uintptr_t>(copy_input_data_size_dev_));
  copy_io_addr_.emplace_back(reinterpret_cast<uintptr_t>(copy_input_src_dev_));
  copy_io_addr_.emplace_back(reinterpret_cast<uintptr_t>(copy_input_dst_dev_));
  return SUCCESS;
}
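// Buffer layout sketch (explanatory comment, not in the original file): InitForSummaryAndCopy
// allocates one ResultSummary slot per output, plus four parallel device arrays of
// num_outputs_ * kCopyNum uint64_t entries, and records their base addresses in
// copy_io_addr_ in a fixed order that SetMemCopyTask below relies on:
//
//   copy_io_addr_[0] -> copy_input_release_flag_dev_  (release flags)
//   copy_io_addr_[1] -> copy_input_data_size_dev_     (data sizes)
//   copy_io_addr_[2] -> copy_input_src_dev_           (source addresses)
//   copy_io_addr_[3] -> copy_input_dst_dev_           (destination addresses)
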
Status AiCpuCCTask::SetMemCopyTask(const domi::KernelDef &kernel_def) {
  auto &memcpy_args = kernel_def.args();
  memcpy_args_size_ = kernel_def.args_size();
  memcpy_so_name_ = kernel_def.so_name();
  memcpy_kernel_name_ = kernel_def.kernel_name();
  if (memcpy_args.size() != memcpy_args_size_) {
    REPORT_INNER_ERROR("E19999", "MemCopy task def args.size=%zu and args_size=%u are not equal.",
                       memcpy_args.size(), memcpy_args_size_);
    GELOGE(FAILED, "[Check][Size]MemCopy task def args.size=%zu and args_size=%u are not equal.",
           memcpy_args.size(), memcpy_args_size_);
    return FAILED;
  }
  if (memcpy_args_size_ < sizeof(aicpu::AicpuParamHead)) {
    REPORT_INNER_ERROR("E19999",
                       "Task def args_size=%u is less than aicpu param head len=%zu.",
                       memcpy_args_size_, sizeof(aicpu::AicpuParamHead));
    GELOGE(FAILED,
           "[Check][Size] Task def args_size=%u is less than aicpu param head len=%zu.",
           memcpy_args_size_, sizeof(aicpu::AicpuParamHead));
    return FAILED;
  }
  memcpy_args_.reset(new(std::nothrow) uint8_t[memcpy_args_size_]());
  if (memcpy_args_ == nullptr) {
    REPORT_INNER_ERROR("E19999", "new memory failed for Node[MemCopy], task_size[%u].",
                       memcpy_args_size_);
    GELOGE(FAILED, "[Malloc][Memory] failed for Node[MemCopy], task_size[%u].",
           memcpy_args_size_);
    return FAILED;
  }
  errno_t sec_ret = memcpy_s(memcpy_args_.get(), memcpy_args_size_, memcpy_args.c_str(), memcpy_args.size());
  if (sec_ret != EOK) {
    REPORT_INNER_ERROR("E19999",
                       "memcpy_s args failed for Node[MemCopy], ret: %d", sec_ret);
    GELOGE(INTERNAL_ERROR,
           "[Update][args] failed for Node[MemCopy], ret: %d", sec_ret);
    return INTERNAL_ERROR;
  }
  auto memcpy_param_head = reinterpret_cast<aicpu::AicpuParamHead *>(memcpy_args_.get());
  uint32_t memcpy_io_num = memcpy_param_head->ioAddrNum;
  auto memcpy_io_addr = memcpy_args_.get() + sizeof(aicpu::AicpuParamHead);
  // If the task has inputs and outputs, copy their device addresses into the io addr area.
  int cpy_ret = memcpy_s(memcpy_io_addr, memcpy_args_size_ - sizeof(aicpu::AicpuParamHead),
                         &copy_io_addr_[0], sizeof(uint64_t) * memcpy_io_num);
  if (cpy_ret != 0) {
    REPORT_INNER_ERROR("E19999", "Node[MemCopy] memcpy io addr to AicpuParamHead failed, "
                       "ret=%d, args_size=%u, io nums=%u.",
                       cpy_ret, memcpy_args_size_, memcpy_io_num);
    GELOGE(INTERNAL_ERROR, "[Update][io_addr]Node[MemCopy] memcpy io addr to AicpuParamHead failed, "
           "ret=%d, args_size=%u, io nums=%u.",
           cpy_ret, memcpy_args_size_, memcpy_io_num);
    return INTERNAL_ERROR;
  }
  GELOGD("Set memcpy task for node[MemCopy] successfully.");
  return SUCCESS;
}
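// Args layout sketch (illustrative, not in the original file): the memcopy kernel args
// begin with an aicpu::AicpuParamHead followed by ioAddrNum uint64_t device addresses,
// which is why the addresses from copy_io_addr_ are written directly after the head:
//
//   +------------------------+--------------------------------+
//   | aicpu::AicpuParamHead  | uint64_t io_addr[ioAddrNum]    |
//   +------------------------+--------------------------------+
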
Status AiCpuBaseTask::UpdateArgTable(const SingleOpModelParam &param) {
  // AICPU tasks do not have a workspace for now.
  return DoUpdateArgTable(param, false);
}

const std::string &AiCpuBaseTask::GetTaskType() const { return kTaskTypeAicpu; }

void AiCpuTask::GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) {
  arg_base = reinterpret_cast<uintptr_t *>(io_addr_host_.data());
  arg_count = io_addr_host_.size();
}

void AiCpuCCTask::SetKernelArgs(std::unique_ptr<uint8_t[]> args, size_t arg_size) {
  args_ = std::move(args);
  arg_size_ = arg_size;
  // The block dim value defaults to 1 for rtCpuKernelLaunch.
  block_dim_ = 1;
}

void AiCpuCCTask::SetSoName(const std::string &so_name) { so_name_ = so_name; }

void AiCpuCCTask::SetkernelName(const std::string &kernel_Name) { kernel_name_ = kernel_Name; }

void AiCpuCCTask::SetIoAddr(uintptr_t *io_addr) { io_addr_ = io_addr; }

const void *AiCpuCCTask::GetArgs() const { return args_.get(); }

size_t AiCpuCCTask::GetArgSize() const { return arg_size_; }

AiCpuCCTask::~AiCpuCCTask() {
}

Status AiCpuCCTask::LaunchKernel(rtStream_t stream) {
  GELOGI("To invoke rtCpuKernelLaunch. block_dim = %u, so_name is %s, kernel_name is %s", block_dim_, so_name_.data(),
         kernel_name_.data());
  // sm_desc is nullptr because the L2 buffer is not supported.
  auto *sm_desc = reinterpret_cast<rtSmDesc_t *>(sm_desc_);
  auto ret = rtCpuKernelLaunchWithFlag(static_cast<const void *>(so_name_.data()),
                                       static_cast<const void *>(kernel_name_.data()),
                                       block_dim_, args_.get(), static_cast<uint32_t>(arg_size_),
                                       sm_desc, stream, dump_flag_);
  if (ret != RT_ERROR_NONE) {
    GELOGE(ret, "[Invoke][rtCpuKernelLaunchWithFlag] failed. ret = %d.", ret);
    REPORT_CALL_ERROR("E19999", "invoke rtCpuKernelLaunchWithFlag failed, ret:%d.", ret);
    return RT_ERROR_TO_GE_STATUS(ret);
  }
  GELOGI("[TASK_INFO] %lu/%s", kernel_id_, op_type_.c_str());
  GELOGD("Invoke rtCpuKernelLaunch succeeded");
  if (is_blocking_aicpu_op_) {
    if (DistributeWaitTaskForAicpuBlockingOp(stream) != SUCCESS) {
      GELOGE(FAILED, "[Call][DistributeWaitTaskForAicpuBlockingOp] failed");
      return FAILED;
    }
  }
  return SUCCESS;
}
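// Launch sketch (illustrative, not in the original file): the CC-kernel path resolves the
// kernel by so/kernel name rather than by a device binary handle. A minimal call with
// hypothetical names, assuming `args`/`arg_size` were prepared via SetKernelArgs and the
// dump flag is zero:
//
//   rtError_t rt_ret = rtCpuKernelLaunchWithFlag("libaicpu_kernels.so",  // hypothetical so name
//                                                "RunCpuKernel",         // hypothetical kernel name
//                                                1U, args, arg_size, nullptr, stream, 0U);
//   if (rt_ret != RT_ERROR_NONE) { /* map to a GE status via RT_ERROR_TO_GE_STATUS */ }
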
void AiCpuCCTask::GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) {
  arg_base = io_addr_;
  arg_count = io_addr_num_;
}

Status MemcpyAsyncTask::LaunchKernel(rtStream_t stream) {
  auto src_addr = reinterpret_cast<void *>(addresses_[0]);
  auto dst_addr = reinterpret_cast<void *>(addresses_[1]);
  // Normalize the copy kind: ADDR_DEVICE_TO_DEVICE is issued as a plain device-to-device copy here.
  kind_ = (kind_ == RT_MEMCPY_ADDR_DEVICE_TO_DEVICE) ? RT_MEMCPY_DEVICE_TO_DEVICE : kind_;
  GE_CHK_RT_RET(rtMemcpyAsync(dst_addr, dst_max_, src_addr, count_, kind_, stream));
  return SUCCESS;
}
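// Async-copy sketch (illustrative, not in the original file): addresses_[0] and addresses_[1]
// hold the source and destination patched in through GetIoAddr below, so a caller only needs
// to enqueue the task on a stream and synchronize when the result is required:
//
//   GE_CHK_STATUS_RET_NOLOG(memcpy_task.LaunchKernel(stream));  // hypothetical task object
//   GE_CHK_RT_RET(rtStreamSynchronize(stream));
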
void MemcpyAsyncTask::GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) {
  arg_base = addresses_;
  arg_count = kMemcpyArgCount;
}
}  // namespace ge
