
op_task.cc 48 kB

/**
 * Copyright 2019-2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "single_op/task/op_task.h"

#include <google/protobuf/extension_set.h>

#include <chrono>
#include <thread>

#include "aicpu/common/aicpu_task_struct.h"
#include "common/dump/dump_manager.h"
#include "common/dump/dump_op.h"
#include "common/profiling/profiling_manager.h"
#include "common/formats/formats.h"
#include "common/math/math_util.h"
#include "framework/common/debug/log.h"
#include "register/op_tiling.h"
#include "runtime/rt.h"
#include "single_op/task/build_task_utils.h"

namespace ge {
namespace {
constexpr int kLaunchRetryTimes = 1000;
constexpr size_t kMemcpyArgCount = 2;
constexpr int kSleepTime = 10;
constexpr uint64_t kReleaseFlag = 1;
constexpr int kCopyNum = 2;
constexpr uint64_t kInferSessionId = 0;

void FreeHbm(void *var) {
  if (var) {
    (void)rtFree(var);
  }
}
}  // namespace
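
// When single-op dump is enabled, OpenDump collects the input/output device
// addresses from the task's argument table and hands them to DumpOp before
// the kernel runs.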
Status OpTask::OpenDump(rtStream_t stream) {
  if (DumpManager::GetInstance().GetDumpProperties(kInferSessionId).IsSingleOpNeedDump()) {
    GELOGI("Dump is open in single op, start to set dump info");
    std::vector<uint64_t> input_addrs;
    std::vector<uint64_t> output_addrs;
    auto input_size = op_desc_->GetInputsSize();
    auto output_size = op_desc_->GetOutputsSize();
    uintptr_t *arg_base = nullptr;
    size_t arg_num = 0;
    GetIoAddr(arg_base, arg_num);
    if (arg_num < input_size + output_size) {
      GELOGE(ACL_ERROR_GE_INTERNAL_ERROR,
             "[Check][Size]io_addrs_for_dump_ size %zu is less than input and output size %zu",
             arg_num, input_size + output_size);
      REPORT_INNER_ERROR("E19999", "io_addrs_for_dump_ size %zu is less than input and output size %zu",
                         arg_num, input_size + output_size);
      return ACL_ERROR_GE_INTERNAL_ERROR;
    }
    for (size_t i = 0; i < input_size; i++) {
      uint64_t input_addr = arg_base[i];
      input_addrs.emplace_back(input_addr);
    }
    for (size_t j = 0; j < output_size; j++) {
      uint64_t output_addr = arg_base[input_size + j];
      output_addrs.emplace_back(output_addr);
    }
    dump_op_.SetDumpInfo(DumpManager::GetInstance().GetDumpProperties(kInferSessionId),
                         op_desc_, input_addrs, output_addrs, stream);
    auto status = dump_op_.LaunchDumpOp();
    if (status != SUCCESS) {
      GELOGE(status, "[Launch][DumpOp] failed in single op.");
      return status;
    }
    return SUCCESS;
  }
  GELOGI("Dump is not open in single op");
  return SUCCESS;
}
void TbeOpTask::SetStubFunc(const std::string &name, const void *stub_func) {
  this->stub_name_ = name;
  this->stub_func_ = stub_func;
  this->task_name_ = name;
}

void TbeOpTask::SetKernelArgs(std::unique_ptr<uint8_t[]> &&args, size_t arg_size, uint32_t block_dim,
                              const OpDescPtr &op_desc) {
  args_ = std::move(args);
  arg_size_ = arg_size;
  block_dim_ = block_dim;
  op_desc_ = op_desc;
}

void TbeOpTask::SetKernelWithHandleArgs(std::unique_ptr<uint8_t[]> &&args, size_t arg_size, uint32_t block_dim,
                                        const OpDescPtr &op_desc,
                                        const domi::KernelDefWithHandle &kernel_def_with_handle) {
  SetKernelArgs(std::move(args), arg_size, block_dim, op_desc);
  original_kernel_key_ = kernel_def_with_handle.original_kernel_key();
  node_info_ = kernel_def_with_handle.node_info();
}

void TbeOpTask::SetSmDesc(void *sm_desc) { sm_desc_ = sm_desc; }

void OpTask::SetModelArgs(std::string model_name, uint32_t model_id) {
  model_name_ = model_name;
  model_id_ = model_id;
}

Status OpTask::GetProfilingArgs(TaskDescInfo &task_desc_info, uint32_t &model_id) {
  uint32_t task_id = 0;
  uint32_t stream_id = 0;
  auto rt_ret = rtGetTaskIdAndStreamID(&task_id, &stream_id);
  if (rt_ret != RT_ERROR_NONE) {
    GELOGE(RT_FAILED, "[Get][TaskIdAndStreamID] failed, ret: 0x%X.", rt_ret);
    REPORT_CALL_ERROR("E19999", "rtGetTaskIdAndStreamID failed, ret: 0x%X.", rt_ret);
    return RT_ERROR_TO_GE_STATUS(rt_ret);
  }
  GE_CHECK_NOTNULL(op_desc_);
  string op_name = op_desc_->GetName();
  GELOGD("Get profiling args of op [%s] end, task_id[%u], stream_id[%u].", op_name.c_str(), task_id, stream_id);
  model_id = model_id_;
  task_desc_info.model_name = model_name_;
  task_desc_info.block_dim = block_dim_;
  task_desc_info.task_id = task_id;
  task_desc_info.stream_id = stream_id;
  task_desc_info.op_name = op_name;
  task_desc_info.op_type = op_desc_->GetType();
  auto &prof_mgr = ProfilingManager::Instance();
  prof_mgr.GetOpInputOutputInfo(op_desc_, task_desc_info);
  return SUCCESS;
}

Status OpTask::UpdateRunInfo() {
  return UNSUPPORTED;
}

Status OpTask::DoUpdateArgTable(const SingleOpModelParam &param, bool keep_workspace) {
  auto addresses = BuildTaskUtils::GetAddresses(op_desc_, param, keep_workspace);
  auto all_addresses = BuildTaskUtils::JoinAddresses(addresses);
  uintptr_t *arg_base = nullptr;
  size_t arg_num = 0;
  GetIoAddr(arg_base, arg_num);
  if (arg_num < all_addresses.size()) {
    GELOGE(ACL_ERROR_GE_INTERNAL_ERROR,
           "[Check][Size][%s] arg number mismatches, expect at least = %zu, but got = %zu.",
           op_desc_->GetName().c_str(), all_addresses.size(), arg_num);
    REPORT_INNER_ERROR("E19999", "%s arg number mismatches, expect at least = %zu, but got = %zu.",
                       op_desc_->GetName().c_str(), all_addresses.size(), arg_num);
    return ACL_ERROR_GE_INTERNAL_ERROR;
  }
  for (void *addr : all_addresses) {
    *arg_base++ = reinterpret_cast<uintptr_t>(addr);
  }
  return SUCCESS;
}

Status OpTask::UpdateArgTable(const SingleOpModelParam &param) {
  return DoUpdateArgTable(param, true);
}

Status OpTask::LaunchKernel(const vector<GeTensorDesc> &input_desc,
                            const vector<DataBuffer> &input_buffers,
                            vector<GeTensorDesc> &output_desc,
                            vector<DataBuffer> &output_buffers,
                            rtStream_t stream) {
  return UNSUPPORTED;
}

const std::string &OpTask::GetTaskType() const { return kTaskTypeInvalid; }

TbeOpTask::~TbeOpTask() {
  if (sm_desc_ != nullptr) {
    (void)rtMemFreeManaged(sm_desc_);
  }
  if (tiling_buffer_ != nullptr) {
    (void)rtFree(tiling_buffer_);
  }
}

const void *TbeOpTask::GetArgs() const { return args_.get(); }

size_t TbeOpTask::GetArgSize() const { return arg_size_; }

const std::string &TbeOpTask::GetStubName() const { return stub_name_; }

const std::string &TbeOpTask::GetTaskType() const { return kTaskTypeAicore; }

void TbeOpTask::SetHandle(void *handle) {
  this->handle_ = handle;
}

Status TbeOpTask::LaunchKernel(rtStream_t stream) {
  GELOGD("To invoke rtKernelLaunch. task = %s, block_dim = %u", this->stub_name_.c_str(), block_dim_);
  auto ret = DoLaunchKernel(stream);
  int retry_times = 0;
  while (ret != RT_ERROR_NONE && retry_times < kLaunchRetryTimes) {
    retry_times++;
    GELOGW("Retry after %d ms, retry_times: %d", kSleepTime, retry_times);
    std::this_thread::sleep_for(std::chrono::milliseconds(kSleepTime));
    ret = DoLaunchKernel(stream);
  }
  if (ret != RT_ERROR_NONE) {
    GELOGE(ret, "[Invoke][RtKernelLaunch] failed. ret = %d, task = %s", ret, this->stub_name_.c_str());
    REPORT_INNER_ERROR("E19999", "invoke rtKernelLaunch failed, ret = %d, task = %s", ret, this->stub_name_.c_str());
    return RT_ERROR_TO_GE_STATUS(ret);
  }
  GELOGI("[TASK_INFO] %s", this->stub_name_.c_str());
  return SUCCESS;
}
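
// For dynamic-shape TBE tasks, tiling is recomputed on every execution: the
// registered op-tiling function (OpParaCalculateV2) produces the block dim,
// tiling data, tiling key and workspace sizes consumed by the launch path.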
Status TbeOpTask::UpdateRunInfo() {
  // invoke OpParaCalculate
  GELOGD("Start to invoke OpParaCalculate.");
  optiling::utils::OpRunInfo run_info(0, true, 0);
  auto ret = optiling::OpParaCalculateV2(*node_, run_info);
  if (ret != GRAPH_SUCCESS) {
    GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[Invoke][OpParaCalculate] failed, ret = %u.", ret);
    REPORT_INNER_ERROR("E19999", "invoke OpParaCalculate failed, ret = %u.", ret);
    return ACL_ERROR_GE_INTERNAL_ERROR;
  }
  block_dim_ = run_info.GetBlockDim();
  tiling_data_ = run_info.GetAllTilingData().str();
  tiling_key_ = run_info.GetTilingKey();
  run_info.GetAllWorkspaces(run_info_workspaces_);
  GELOGD("Done invoking OpParaCalculate successfully. block_dim = %u, tiling size = %zu, tiling_key = %u", block_dim_,
         tiling_data_.size(), tiling_key_);
  return SUCCESS;
}

Status TbeOpTask::UpdateTensorDesc(const GeTensorDesc &src_tensor, GeTensorDesc &dst_tensor) {
  int64_t storage_format_val = static_cast<Format>(FORMAT_RESERVED);
  (void)AttrUtils::GetInt(src_tensor, ge::ATTR_NAME_STORAGE_FORMAT, storage_format_val);
  auto storage_format = static_cast<Format>(storage_format_val);
  if (storage_format == FORMAT_RESERVED) {
    GELOGD("Storage format not set. update shape to [%s], and original shape to [%s]",
           src_tensor.GetShape().ToString().c_str(), src_tensor.GetOriginShape().ToString().c_str());
    dst_tensor.SetShape(src_tensor.GetShape());
    dst_tensor.SetOriginShape(src_tensor.GetOriginShape());
  } else {
    std::vector<int64_t> storage_shape;
    if (!AttrUtils::GetListInt(src_tensor, ge::ATTR_NAME_STORAGE_SHAPE, storage_shape)) {
      GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[Get][ListInt]failed while storage_format was set.");
      return ACL_ERROR_GE_INTERNAL_ERROR;
    }
    GELOGD("Storage format set. update shape to [%s], and original shape to [%s]",
           GeShape(storage_shape).ToString().c_str(), src_tensor.GetShape().ToString().c_str());
    dst_tensor.SetShape(GeShape(std::move(storage_shape)));
    dst_tensor.SetOriginShape(src_tensor.GetShape());
  }
  return SUCCESS;
}

Status TbeOpTask::UpdateNodeByShape(const vector<GeTensorDesc> &input_desc, const vector<GeTensorDesc> &output_desc) {
  auto op_desc = node_->GetOpDesc();
  GE_CHECK_NOTNULL(op_desc);
  // Set runtime shape to node
  for (size_t i = 0; i < input_desc.size(); ++i) {
    auto tensor_desc = op_desc->MutableInputDesc(i);
    auto &runtime_tensor_desc = input_desc[i];
    GE_CHECK_NOTNULL(tensor_desc);
    GE_CHK_STATUS_RET(UpdateTensorDesc(runtime_tensor_desc, *tensor_desc));
  }
  for (size_t i = 0; i < output_desc.size(); ++i) {
    auto tensor_desc = op_desc->MutableOutputDesc(i);
    auto &runtime_tensor_desc = output_desc[i];
    GE_CHECK_NOTNULL(tensor_desc);
    GE_CHK_STATUS_RET(UpdateTensorDesc(runtime_tensor_desc, *tensor_desc));
  }
  return SUCCESS;
}

Status TbeOpTask::EnableDynamicSupport(const NodePtr &node, void *tiling_buffer, uint32_t max_tiling_size) {
  if (tiling_buffer != nullptr) {
    uintptr_t *arg_base = nullptr;
    size_t arg_num = 0;
    GetIoAddr(arg_base, arg_num);
    GE_CHECK_NOTNULL(node);
    GE_CHECK_NOTNULL(node->GetOpDesc());
    uint32_t inputs_num = node->GetOpDesc()->GetInputsSize();
    uint32_t outputs_num = node->GetOpDesc()->GetOutputsSize();
    uint32_t workspace_nums = node->GetOpDesc()->GetWorkspace().size();
    uint32_t tiling_index = inputs_num + outputs_num + workspace_nums;
    if (arg_num == 0 || arg_num < tiling_index) {
      GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[Check][Size]Tiling index %u, arg number %zu is invalid.",
             tiling_index, arg_num);
      return ACL_ERROR_GE_INTERNAL_ERROR;
    }
    arg_base[tiling_index] = reinterpret_cast<uintptr_t>(tiling_buffer);
  }
  node_ = node;
  tiling_buffer_ = tiling_buffer;
  max_tiling_size_ = max_tiling_size;
  return SUCCESS;
}
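
// Workspaces for a dynamic op are carved out of a single stream-resource
// allocation: offsets are accumulated first (with overflow checks), then one
// buffer is allocated and sliced by offset.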
Status TbeOpTask::AllocateWorkspaces(const vector<int64_t> &workspace_sizes) {
  static const std::string kPurpose("malloc workspace memory for dynamic op.");
  workspaces_.clear();
  if (workspace_sizes.empty()) {
    GELOGD("No need to allocate workspace.");
    return SUCCESS;
  }
  int64_t total_size = 0;
  std::vector<int64_t> ws_offsets;
  for (auto ws_size : workspace_sizes) {
    // alignment and padding should be done in OpParaCalculate
    if (CheckInt64AddOverflow(total_size, ws_size) != SUCCESS) {
      return ACL_ERROR_GE_INTERNAL_ERROR;
    }
    ws_offsets.emplace_back(total_size);
    total_size += ws_size;
  }
  GELOGD("Total workspace size is %ld", total_size);
  GE_CHECK_NOTNULL(stream_resource_);
  auto ws_base = stream_resource_->MallocMemory(kPurpose, static_cast<size_t>(total_size));
  if (ws_base == nullptr) {
    GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Malloc][Memory] failed, size: %ld", total_size);
    REPORT_INNER_ERROR("E19999", "MallocMemory failed, size: %ld", total_size);
    return ACL_ERROR_GE_MEMORY_ALLOCATION;
  }
  GELOGD("Done allocating workspace memory successfully.");
  for (auto ws_offset : ws_offsets) {
    workspaces_.emplace_back(ws_base + ws_offset);
  }
  return SUCCESS;
}
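
// The TBE argument table is laid out as [inputs | outputs | workspaces |
// optional tiling pointer]. The args buffer is grown if it is too small,
// workspace addresses are appended, and the tiling data is copied to the
// device tiling buffer asynchronously.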
Status TbeOpTask::UpdateTilingArgs(rtStream_t stream) {
  size_t args_size = input_num_ + output_num_ + workspaces_.size();
  if (tiling_buffer_ != nullptr) {
    args_size++;
  }
  size_t temp_size = args_size * sizeof(void *);
  if (arg_size_ < temp_size) {
    GELOGD("Need to reset size of args_ from %zu to %zu.", arg_size_, temp_size);
    std::unique_ptr<uint8_t[]> args(new (std::nothrow) uint8_t[temp_size]());
    GE_CHECK_NOTNULL(args);
    if (memcpy_s(args.get(), temp_size, args_.get(), arg_size_) != EOK) {
      GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "[Update][KernelArgs] failed for [%s].", node_->GetName().c_str());
      REPORT_INNER_ERROR("E19999", "update kernel args failed for %s.", node_->GetName().c_str());
      return ACL_ERROR_GE_MEMORY_OPERATE_FAILED;
    }
    args_ = std::move(args);
    arg_size_ = temp_size;
  }
  uintptr_t *arg_base = reinterpret_cast<uintptr_t *>(args_.get());
  size_t arg_index = input_num_ + output_num_;
  for (size_t i = 0; i < workspaces_.size(); ++i) {
    arg_base[arg_index++] = reinterpret_cast<uintptr_t>(workspaces_[i]);
  }
  if (tiling_buffer_ != nullptr) {
    GELOGD("[%s] Start to copy tiling info. size = %zu", node_->GetName().c_str(), tiling_data_.size());
    GE_CHK_RT_RET(rtMemcpyAsync(tiling_buffer_, max_tiling_size_, tiling_data_.data(), tiling_data_.size(),
                                RT_MEMCPY_HOST_TO_DEVICE_EX, stream));
    arg_base[arg_index] = reinterpret_cast<uintptr_t>(tiling_buffer_);
  }
  return SUCCESS;
}

Status TbeOpTask::SetArgIndex() {
  const vector<bool> v_is_input_const = op_desc_->GetIsInputConst();
  size_t input_index = 0;
  for (size_t i = 0; i < op_desc_->GetAllInputsSize(); ++i) {
    const GeTensorDescPtr tensor_desc = op_desc_->MutableInputDesc(static_cast<uint32_t>(i));
    if (tensor_desc == nullptr) {
      GELOGD("SingleOp: %s, Index: %zu, has no input", op_desc_->GetName().c_str(), i);
      continue;
    }
    if (i < v_is_input_const.size() && v_is_input_const[i]) {
      GELOGD("SingleOp: %s, Index: %zu, input is const", op_desc_->GetName().c_str(), i);
      input_index++;
      continue;
    }
    arg_index_.emplace_back(input_index);
    input_index++;
  }
  return SUCCESS;
}

Status TbeOpTask::UpdateIoAddr(const vector<DataBuffer> &inputs, const vector<DataBuffer> &outputs) {
  if (arg_index_.size() != inputs.size()) {
    GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Check][Size] Args size is %zu, but get input size is %zu.",
           arg_index_.size(), inputs.size());
    REPORT_INNER_ERROR("E19999", "[Check][Size] Args size is %zu, but get input size is %zu.",
                       arg_index_.size(), inputs.size());
    return ACL_ERROR_GE_PARAM_INVALID;
  }
  uintptr_t *arg_base = reinterpret_cast<uintptr_t *>(args_.get());
  for (size_t i = 0; i < arg_index_.size(); ++i) {
    arg_base[arg_index_[i]] = reinterpret_cast<uintptr_t>(inputs[i].data);
  }
  for (size_t i = 0; i < op_desc_->GetOutputsSize(); ++i) {
    arg_base[input_num_ + i] = reinterpret_cast<uintptr_t>(outputs[i].data);
  }
  return SUCCESS;
}

Status TbeOpTask::LaunchKernel(const vector<GeTensorDesc> &input_desc,
                               const vector<DataBuffer> &input_buffers,
                               vector<GeTensorDesc> &output_desc,
                               vector<DataBuffer> &output_buffers,
                               rtStream_t stream) {
  GELOGD("[%s] Start to launch kernel", node_->GetName().c_str());
  GE_CHK_STATUS_RET(UpdateIoAddr(input_buffers, output_buffers), "[Update][IoAddr] failed.");
  GE_CHK_STATUS_RET_NOLOG(UpdateNodeByShape(input_desc, output_desc));
  GE_CHK_STATUS_RET_NOLOG(UpdateRunInfo());
  GE_CHK_STATUS_RET(AllocateWorkspaces(run_info_workspaces_), "[Allocate][Workspaces] failed.");
  GE_CHK_STATUS_RET(UpdateTilingArgs(stream), "[Update][TilingArgs] failed.");
  GELOGD("[%s] Start to invoke rtKernelLaunch", node_->GetName().c_str());
  GE_CHK_STATUS_RET(DoLaunchKernel(stream), "Failed to do launch kernel.");
  return SUCCESS;
}

Status TbeOpTask::DoLaunchKernel(rtStream_t stream) {
  auto *sm_desc = reinterpret_cast<rtSmDesc_t *>(sm_desc_);
  if (handle_ == nullptr) {
    GE_CHK_RT_RET(rtKernelLaunch(stub_func_, block_dim_, args_.get(), static_cast<uint32_t>(arg_size_),
                                 sm_desc, stream));
  } else {
    std::string dev_func = original_kernel_key_ + "_" + std::to_string(tiling_key_);
    std::string kernel_info = node_info_ + "/" + std::to_string(tiling_key_);
    GE_CHK_RT_RET(rtKernelLaunchWithHandle(handle_, dev_func.c_str(), block_dim_, args_.get(),
                                           static_cast<uint32_t>(arg_size_), sm_desc, stream, kernel_info.c_str()));
  }
  return SUCCESS;
}

void TbeOpTask::GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) {
  arg_base = reinterpret_cast<uintptr_t *>(args_.get());
  arg_count = arg_size_ / sizeof(void *);
  if (tiling_buffer_ != nullptr) {
    --arg_count;
  }
}

AiCpuBaseTask::~AiCpuBaseTask() {
  if (ext_info_addr_dev_ != nullptr) {
    (void)rtFree(ext_info_addr_dev_);
  }
  FreeHbm(copy_ioaddr_dev_);
  FreeHbm(copy_input_release_flag_dev_);
  FreeHbm(copy_input_data_size_dev_);
  FreeHbm(copy_input_src_dev_);
  FreeHbm(copy_input_dst_dev_);
  for (auto summary : output_summary_) {
    FreeHbm(summary);
  }
  for (auto out_shape : out_shape_hbm_) {
    FreeHbm(out_shape);
  }
}
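
// An AICPU task carries an "ext info" blob describing input/output shapes and
// types. It is parsed on host via AicpuExtInfoHandler, stamped with session
// and kernel identity, and mirrored into device memory for the kernel to read.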
Status AiCpuBaseTask::SetExtInfoAndType(const std::string &kernel_ext_info, uint64_t kernel_id) {
  if (kernel_ext_info.empty()) {
    GELOGI("Kernel_ext_info is empty, no need to copy to device.");
    return SUCCESS;
  }
  int32_t unknown_shape_type_val = 0;
  (void)AttrUtils::GetInt(op_desc_, ::ge::ATTR_NAME_UNKNOWN_SHAPE_TYPE, unknown_shape_type_val);
  GELOGD("Get unknown_type is %d.", unknown_shape_type_val);
  unknown_type_ = static_cast<UnknowShapeOpType>(unknown_shape_type_val);
  aicpu_ext_handle_.reset(new (std::nothrow) ::ge::hybrid::AicpuExtInfoHandler(op_desc_->GetName(),
                                                                               num_inputs_,
                                                                               num_outputs_,
                                                                               unknown_type_));
  GE_CHK_BOOL_RET_STATUS(aicpu_ext_handle_ != nullptr, ACL_ERROR_GE_MEMORY_ALLOCATION,
                         "[Malloc][Memory] failed for aicpu_ext_handle!");
  Status ret = aicpu_ext_handle_->Parse(kernel_ext_info);
  if (ret != SUCCESS) {
    GELOGE(ret, "[Parse][Param:kernel_ext_info] failed, kernel_ext_info_size=%zu.", kernel_ext_info.size());
    REPORT_INNER_ERROR("E19999",
                       "Parse Param:kernel_ext_info failed, kernel_ext_info_size=%zu.", kernel_ext_info.size());
    return ret;
  }
  GE_CHK_STATUS_RET(aicpu_ext_handle_->UpdateSessionInfo(ULLONG_MAX, kernel_id, false),
                    "[Update][SessionInfo] failed.");
  GE_CHK_RT_RET(rtMalloc(&ext_info_addr_dev_, aicpu_ext_handle_->GetExtInfoLen(), RT_MEMORY_HBM));
  GE_CHK_RT_RET(rtMemcpy(ext_info_addr_dev_, aicpu_ext_handle_->GetExtInfoLen(),
                         aicpu_ext_handle_->GetExtInfo(), aicpu_ext_handle_->GetExtInfoLen(),
                         RT_MEMCPY_HOST_TO_DEVICE));
  return SUCCESS;
}

Status AiCpuBaseTask::SetInputConst() {
  input_is_const_.clear();
  const vector<bool> v_is_input_const = op_desc_->GetIsInputConst();
  for (size_t i = 0; i < op_desc_->GetAllInputsSize(); ++i) {
    const GeTensorDescPtr tensor_desc = op_desc_->MutableInputDesc(static_cast<uint32_t>(i));
    if (tensor_desc == nullptr) {
      GELOGD("SingleOp: %s, Index: %zu, has no input", op_desc_->GetName().c_str(), i);
      continue;
    }
    if (i < v_is_input_const.size() && v_is_input_const[i]) {
      GELOGD("SingleOp: %s, Index: %zu, input is const", op_desc_->GetName().c_str(), i);
      input_is_const_.push_back(true);
      continue;
    }
    input_is_const_.push_back(false);
  }
  return SUCCESS;
}

Status AiCpuBaseTask::UpdateExtInfo(const std::vector<GeTensorDesc> &input_desc,
                                    std::vector<GeTensorDesc> &output_desc,
                                    rtStream_t stream) {
  GELOGI("Update ext info begin, unknown_type=%d.", unknown_type_);
  GE_CHECK_NOTNULL(aicpu_ext_handle_);
  GE_CHK_STATUS_RET(aicpu_ext_handle_->UpdateExecuteMode(false), "[Update][ExecuteMode] failed.");
  if (num_inputs_ == 0 && num_outputs_ == 0) {
    GELOGI("No input and output, no need to update ext info.");
    return SUCCESS;
  }
  size_t non_const_index = 0;
  for (size_t input_index = 0; input_index < num_inputs_; input_index++) {
    if (input_index < input_is_const_.size() && input_is_const_[input_index]) {
      // for a const input, take the desc from op_desc_; num_inputs_ is op_desc_'s input size
      auto const_input_desc = op_desc_->MutableInputDesc(static_cast<uint32_t>(input_index));
      GE_CHECK_NOTNULL(const_input_desc);
      GE_CHK_STATUS_RET(aicpu_ext_handle_->UpdateInputShapeAndType(input_index, *const_input_desc),
                        "[Update][InputShapeAndType] failed, input_index:%zu.", input_index);
      continue;
    }
    GE_CHK_BOOL_RET_STATUS(non_const_index < input_desc.size(), ACL_ERROR_GE_PARAM_INVALID,
                           "[Check][Size]Input_desc size is %zu, but get non_const_index is %zu",
                           input_desc.size(), non_const_index);
    GE_CHK_STATUS_RET(aicpu_ext_handle_->UpdateInputShapeAndType(input_index, input_desc[non_const_index]),
                      "[Update][InputShapeAndType] failed, input_index:%zu.", input_index);
    if (DumpManager::GetInstance().GetDumpProperties(kInferSessionId).IsSingleOpNeedDump()) {
      GE_CHK_STATUS_RET(op_desc_->UpdateInputDesc(input_index, input_desc[non_const_index]),
                        "AiCpuTask Update [%zu]th input desc failed.", input_index);
    }
    non_const_index++;
  }
  if (unknown_type_ != DEPEND_COMPUTE) {
    for (size_t j = 0; j < num_outputs_; ++j) {
      GE_CHK_STATUS_RET(aicpu_ext_handle_->UpdateOutputShapeAndType(j, output_desc[j]),
                        "[Update][OutputShapeAndType] failed, Output:%zu.", j);
      if (DumpManager::GetInstance().GetDumpProperties(kInferSessionId).IsSingleOpNeedDump()) {
        GE_CHK_STATUS_RET(op_desc_->UpdateOutputDesc(j, output_desc[j]),
                          "AiCpuTask Update [%zu]th output desc failed.", j);
      }
    }
  }
  GE_CHK_RT_RET(rtMemcpyAsync(ext_info_addr_dev_,
                              aicpu_ext_handle_->GetExtInfoLen(),  // check size
                              aicpu_ext_handle_->GetExtInfo(),
                              aicpu_ext_handle_->GetExtInfoLen(),
                              RT_MEMCPY_HOST_TO_DEVICE_EX,
                              stream));
  GELOGI("Update ext info end.");
  return SUCCESS;
}

Status AiCpuBaseTask::UpdateOutputShape(vector<GeTensorDesc> &output_desc) {
  if (num_outputs_ == 0) {
    GELOGD("AiCpuBaseTask output_num is 0, no need to update output shape.");
    return SUCCESS;
  }
  GELOGD("Start to update DEPEND_SHAPE_RANGE AiCpuBaseTask output shape.");
  GE_CHK_RT_RET(rtMemcpy(aicpu_ext_handle_->GetExtInfo(), aicpu_ext_handle_->GetExtInfoLen(), ext_info_addr_dev_,
                         aicpu_ext_handle_->GetExtInfoLen(), RT_MEMCPY_DEVICE_TO_HOST));
  for (size_t i = 0; i < num_outputs_; ++i) {
    GeShape shape;
    DataType data_type;
    aicpu_ext_handle_->GetOutputShapeAndType(i, shape, data_type);
    GE_CHK_STATUS_RET(UpdateShapeToOutputDesc(shape, output_desc[i]),
                      "[Update][ShapeToOutputDesc] failed, output:%zu.", i);
    if (DumpManager::GetInstance().GetDumpProperties(kInferSessionId).IsSingleOpNeedDump()) {
      GE_CHK_STATUS_RET(op_desc_->UpdateOutputDesc(i, output_desc[i]), "[Update][OutputDesc] failed, output:%zu.", i);
    }
  }
  GELOGD("Update DEPEND_SHAPE_RANGE AiCpuBaseTask output shape finished.");
  return SUCCESS;
}

Status AiCpuBaseTask::UpdateShapeToOutputDesc(const GeShape &shape_new, GeTensorDesc &output_desc) {
  auto shape_old = output_desc.GetShape();
  output_desc.SetShape(shape_new);
  GELOGD("Update AiCpuBaseTask shape from %s to %s", shape_old.ToString().c_str(), shape_new.ToString().c_str());
  auto origin_shape_old = output_desc.GetOriginShape();
  auto origin_format = output_desc.GetOriginFormat();
  auto format = output_desc.GetFormat();
  if (origin_format == format) {
    output_desc.SetOriginShape(shape_new);
    return SUCCESS;
  }
  std::vector<int64_t> origin_dims_new;
  auto trans_ret = formats::TransShape(format, shape_new.GetDims(),
                                       output_desc.GetDataType(), origin_format, origin_dims_new);
  GE_CHK_STATUS_RET(trans_ret,
                    "[Trans][Shape] failed, AiCpuTask originFormat[%d] is not same as format[%d], shape=%s.",
                    origin_format, format, shape_new.ToString().c_str());
  auto origin_shape_new = GeShape(origin_dims_new);
  output_desc.SetOriginShape(origin_shape_new);
  GELOGD("AiCpuTask originFormat[%d] is not same as format[%d], need to update from %s to %s.",
         origin_format, format, origin_shape_old.ToString().c_str(), origin_shape_new.ToString().c_str());
  return SUCCESS;
}

Status AiCpuBaseTask::UpdateIoAddr(const vector<DataBuffer> &inputs, const vector<DataBuffer> &outputs) {
  uintptr_t *arg_base = nullptr;
  size_t arg_num = 0;
  GetIoAddr(arg_base, arg_num);
  // input and output numbers were checked in ValidateParams
  size_t non_const_index = 0;
  for (size_t input_index = 0; input_index < num_inputs_; input_index++) {
    if (input_index < input_is_const_.size() && input_is_const_[input_index]) {
      // a const input does not need its address updated
      GE_CHECK_NOTNULL(arg_base);
      GELOGD("AICpuTask input[%zu] addr = %lu", input_index, *arg_base);
      arg_base++;
      continue;
    }
    GE_CHK_BOOL_RET_STATUS(non_const_index < inputs.size(), ACL_ERROR_GE_PARAM_INVALID,
                           "[Check][Size] Input size is %zu, but get non_const_index is %zu",
                           inputs.size(), non_const_index);
    auto addr = inputs[non_const_index].data;
    GE_CHECK_NOTNULL(addr);
    GELOGD("AICpuTask input[%zu] addr = %p", input_index, addr);
    *arg_base++ = reinterpret_cast<uintptr_t>(addr);
    non_const_index++;
  }
  for (size_t i = 0; i < outputs.size(); ++i) {
    auto addr = outputs[i].data;
    GE_CHECK_NOTNULL(addr);
    GELOGD("AICpuTask output[%zu] addr = %p", i, addr);
    *arg_base++ = reinterpret_cast<uintptr_t>(addr);
  }
  return SUCCESS;
}

AiCpuTask::~AiCpuTask() {
  FreeHbm(args_);
  FreeHbm(io_addr_);
  FreeHbm(workspace_addr_);
  FreeHbm(copy_workspace_buf_);
  FreeHbm(copy_task_args_buf_);
}

Status AiCpuTask::LaunchKernel(rtStream_t stream) {
  GELOGD("Start to launch kernel. task = %s", this->op_type_.c_str());
  auto ret = rtMemcpyAsync(io_addr_,
                           io_addr_size_,
                           io_addr_host_.data(),
                           io_addr_host_.size() * sizeof(void *),
                           RT_MEMCPY_HOST_TO_DEVICE_EX,
                           stream);
  if (ret != RT_ERROR_NONE) {
    GELOGE(ret, "[MemcpyAsync][Data] failed. ret = %d, task = %s", ret, this->op_type_.c_str());
    REPORT_CALL_ERROR("E19999", "rtMemcpyAsync data failed, ret = %d, task = %s", ret, this->op_type_.c_str());
    return RT_ERROR_TO_GE_STATUS(ret);
  }
  GELOGI("To invoke rtKernelLaunchEx. task = %s", this->op_type_.c_str());
  ret = rtKernelLaunchEx(args_, arg_size_, 0, stream);
  if (ret != RT_ERROR_NONE) {
    GELOGE(ret, "[Invoke][rtKernelLaunchEx] failed. ret = %d, task = %s", ret, this->op_type_.c_str());
    REPORT_CALL_ERROR("E19999", "invoke rtKernelLaunchEx failed, ret = %d, task = %s", ret, this->op_type_.c_str());
    return RT_ERROR_TO_GE_STATUS(ret);
  }
  GELOGI("[TASK_INFO] %lu/%s", kernel_id_, op_type_.c_str());
  GELOGD("Done launching kernel successfully. task = %s", this->op_type_.c_str());
  return SUCCESS;
}
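
// For DEPEND_COMPUTE ops, the kernel first writes one ResultSummary per output
// (raw data pointer/size, shape data pointer/size) instead of the outputs
// themselves; a follow-up memcpy task driven by four parallel arrays (release
// flags, sizes, sources, destinations) then moves the data into place.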
Status AiCpuBaseTask::PrepareCopyInputs(vector<DataBuffer> &outputs) {
  std::vector<uint64_t> copy_input_release_flag;
  std::vector<uint64_t> copy_input_data_size;
  std::vector<uint64_t> copy_input_src;
  std::vector<uint64_t> copy_input_dst;
  for (size_t i = 0; i < num_outputs_; ++i) {
    const auto &summary = output_summary_host_[i];
    GELOGI("Node out[%zu] summary, shape data=0x%lx, shape data size=%lu, raw data=0x%lx, raw data size=%lu.",
           i, summary.shape_data_ptr, summary.shape_data_size,
           summary.raw_data_ptr, summary.raw_data_size);
    auto output = outputs[i];
    copy_input_release_flag.emplace_back(kReleaseFlag);
    if (summary.raw_data_size > 0) {
      copy_input_data_size.emplace_back(output.length);
    } else {
      copy_input_data_size.emplace_back(summary.raw_data_size);
    }
    copy_input_src.emplace_back(summary.raw_data_ptr);
    copy_input_dst.emplace_back(reinterpret_cast<uintptr_t>(output.data));
    const auto &shape_buffer = out_shape_hbm_[i];
    copy_input_release_flag.emplace_back(kReleaseFlag);
    copy_input_data_size.emplace_back(summary.shape_data_size);
    copy_input_src.emplace_back(summary.shape_data_ptr);
    copy_input_dst.emplace_back(reinterpret_cast<uintptr_t>(shape_buffer));
  }
  const size_t copy_input_buf_len = num_outputs_ * kCopyNum * sizeof(uint64_t);
  GE_CHK_RT_RET(rtMemcpy(copy_input_release_flag_dev_, copy_input_buf_len,
                         copy_input_release_flag.data(), copy_input_buf_len, RT_MEMCPY_HOST_TO_DEVICE));
  GE_CHK_RT_RET(rtMemcpy(copy_input_data_size_dev_, copy_input_buf_len,
                         copy_input_data_size.data(), copy_input_buf_len, RT_MEMCPY_HOST_TO_DEVICE));
  GE_CHK_RT_RET(rtMemcpy(copy_input_src_dev_, copy_input_buf_len,
                         copy_input_src.data(), copy_input_buf_len, RT_MEMCPY_HOST_TO_DEVICE));
  GE_CHK_RT_RET(rtMemcpy(copy_input_dst_dev_, copy_input_buf_len,
                         copy_input_dst.data(), copy_input_buf_len, RT_MEMCPY_HOST_TO_DEVICE));
  return SUCCESS;
}

Status AiCpuBaseTask::ReadResultSummaryAndPrepareMemory() {
  for (size_t i = 0; i < num_outputs_; ++i) {
    auto &result_summary = output_summary_host_[i];
    GE_CHK_RT_RET(rtMemcpy(&result_summary, sizeof(aicpu::FWKAdapter::ResultSummary),
                           output_summary_[i], sizeof(aicpu::FWKAdapter::ResultSummary),
                           RT_MEMCPY_DEVICE_TO_HOST));
    auto shape_data_size = result_summary.shape_data_size;
    void *shape_buffer = nullptr;
    if (shape_data_size > 0) {
      GE_CHK_RT_RET(rtMalloc(&shape_buffer, shape_data_size, RT_MEMORY_HBM));
    }
    out_shape_hbm_.emplace_back(shape_buffer);
  }
  return SUCCESS;
}

Status AiCpuTask::CopyDataToHbm(vector<DataBuffer> &outputs, rtStream_t stream) {
  GE_CHK_STATUS_RET_NOLOG(PrepareCopyInputs(outputs));
  GE_CHK_RT_RET(rtKernelLaunchEx(copy_task_args_buf_, sizeof(STR_FWK_OP_KERNEL),
                                 RT_KERNEL_DEFAULT, stream));
  GE_CHK_RT_RET(rtStreamSynchronize(stream));
  return SUCCESS;
}

Status AiCpuCCTask::CopyDataToHbm(vector<DataBuffer> &outputs, rtStream_t stream) {
  GE_CHK_STATUS_RET_NOLOG(PrepareCopyInputs(outputs));
  auto ret = rtCpuKernelLaunchWithFlag(static_cast<const void *>(memcpy_so_name_.data()),
                                       static_cast<const void *>(memcpy_kernel_name_.data()),
                                       block_dim_, memcpy_args_.get(), static_cast<uint32_t>(memcpy_args_size_),
                                       nullptr, stream, dump_flag_);
  GE_CHK_RT_RET(ret);
  GE_CHK_RT_RET(rtStreamSynchronize(stream));
  return SUCCESS;
}

Status AiCpuBaseTask::UpdateShapeByHbmBuffer(vector<GeTensorDesc> &output_desc) {
  for (size_t i = 0; i < num_outputs_; ++i) {
    const auto &result_summary = output_summary_host_[i];
    std::vector<int64_t> shape_dims;
    if (result_summary.shape_data_size > 0) {
      const auto &shape_hbm = out_shape_hbm_[i];
      uint32_t dim_num = result_summary.shape_data_size / sizeof(int64_t);
      std::unique_ptr<int64_t[]> shape_addr(new (std::nothrow) int64_t[dim_num]());
      GE_CHECK_NOTNULL(shape_addr);
      GE_CHK_RT_RET(rtMemcpy(shape_addr.get(), result_summary.shape_data_size, shape_hbm,
                             result_summary.shape_data_size, RT_MEMCPY_DEVICE_TO_HOST));
      for (uint32_t dim_idx = 0; dim_idx < dim_num; ++dim_idx) {
        shape_dims.emplace_back(shape_addr[dim_idx]);
        GELOGD("Node [%zu]th output dim[%u]=%ld.", i, dim_idx, shape_addr[dim_idx]);
      }
    }
    GE_CHK_STATUS_RET(UpdateShapeToOutputDesc(GeShape(shape_dims), output_desc[i]),
                      "[Update][ShapeToOutputDesc] failed, output:%zu.", i);
    if (DumpManager::GetInstance().GetDumpProperties(kInferSessionId).IsSingleOpNeedDump()) {
      GE_CHK_STATUS_RET(op_desc_->UpdateOutputDesc(i, output_desc[i]), "[Update][OutputDesc] failed, output:%zu.", i);
    }
  }
  return SUCCESS;
}

Status AiCpuBaseTask::UpdateShapeAndDataByResultSummary(vector<GeTensorDesc> &output_desc,
                                                        vector<DataBuffer> &outputs,
                                                        rtStream_t stream) {
  if (num_outputs_ == 0) {
    GELOGI("Output num is 0, there is no need to update the output and size.");
    return SUCCESS;
  }
  GELOGI("Update shape and data by result summary begin.");
  for (auto out_shape : out_shape_hbm_) {
    FreeHbm(out_shape);
  }
  out_shape_hbm_.clear();
  GE_CHK_STATUS_RET(ReadResultSummaryAndPrepareMemory(),
                    "[Read][ResultSummaryAndPrepareMemory] failed.");
  GE_CHK_STATUS_RET(CopyDataToHbm(outputs, stream),
                    "[Copy][DataToHbm] failed.");
  GE_CHK_STATUS_RET(UpdateShapeByHbmBuffer(output_desc),
                    "[Update][ShapeByHbmBuffer] failed.");
  for (auto out_shape : out_shape_hbm_) {
    FreeHbm(out_shape);
  }
  out_shape_hbm_.clear();
  GELOGI("Update shape and data by result summary end.");
  return SUCCESS;
}
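
// Pre-allocates the device buffers used by the summary/copy protocol: one
// ResultSummary per output, the four parallel copy arrays, and the argument
// buffer for the copy task itself.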
Status AiCpuTask::InitForSummaryAndCopy() {
  if (unknown_type_ != DEPEND_COMPUTE || num_outputs_ == 0) {
    GELOGI("Unknown_type is %d, output num is %zu.", unknown_type_, num_outputs_);
    return SUCCESS;
  }
  output_summary_.resize(num_outputs_);
  constexpr auto result_summary_size = sizeof(aicpu::FWKAdapter::ResultSummary);
  for (size_t i = 0; i < num_outputs_; ++i) {
    GE_CHK_RT_RET(rtMalloc(&output_summary_[i], result_summary_size, RT_MEMORY_HBM));
  }
  output_summary_host_.resize(num_outputs_);
  const size_t copy_input_buf_len = num_outputs_ * kCopyNum * sizeof(uint64_t);
  GE_CHK_RT_RET(rtMalloc(&copy_input_release_flag_dev_, copy_input_buf_len, RT_MEMORY_HBM));
  GE_CHK_RT_RET(rtMalloc(&copy_input_data_size_dev_, copy_input_buf_len, RT_MEMORY_HBM));
  GE_CHK_RT_RET(rtMalloc(&copy_input_src_dev_, copy_input_buf_len, RT_MEMORY_HBM));
  GE_CHK_RT_RET(rtMalloc(&copy_input_dst_dev_, copy_input_buf_len, RT_MEMORY_HBM));
  GE_CHK_RT_RET(rtMalloc(&copy_task_args_buf_, sizeof(STR_FWK_OP_KERNEL), RT_MEMORY_HBM));
  std::vector<uint64_t> copy_io_addr;
  copy_io_addr.emplace_back(reinterpret_cast<uintptr_t>(copy_input_release_flag_dev_));
  copy_io_addr.emplace_back(reinterpret_cast<uintptr_t>(copy_input_data_size_dev_));
  copy_io_addr.emplace_back(reinterpret_cast<uintptr_t>(copy_input_src_dev_));
  copy_io_addr.emplace_back(reinterpret_cast<uintptr_t>(copy_input_dst_dev_));
  const auto copy_io_addr_size = sizeof(uint64_t) * copy_io_addr.size();
  GE_CHK_RT_RET(rtMalloc(&copy_ioaddr_dev_, copy_io_addr_size, RT_MEMORY_HBM));
  GE_CHK_RT_RET(rtMemcpy(copy_ioaddr_dev_, copy_io_addr_size,
                         copy_io_addr.data(), copy_io_addr_size, RT_MEMCPY_HOST_TO_DEVICE));
  return SUCCESS;
}

Status AiCpuTask::SetMemCopyTask(const domi::KernelExDef &kernel_def) {
  if (kernel_def.args_size() > sizeof(STR_FWK_OP_KERNEL)) {
    GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Check][Size]sizeof STR_FWK_OP_KERNEL is: %lu, but args_size is: %d",
           sizeof(STR_FWK_OP_KERNEL), kernel_def.args_size());
    REPORT_INNER_ERROR("E19999", "sizeof STR_FWK_OP_KERNEL is: %lu, but args_size is: %d",
                       sizeof(STR_FWK_OP_KERNEL), kernel_def.args_size());
    return ACL_ERROR_GE_PARAM_INVALID;
  }
  GE_CHK_RT_RET(rtMalloc(&copy_workspace_buf_, kernel_def.task_info_size(), RT_MEMORY_HBM));
  GE_CHK_RT_RET(rtMemcpy(copy_workspace_buf_, kernel_def.task_info_size(),
                         kernel_def.task_info().data(), kernel_def.task_info_size(), RT_MEMCPY_HOST_TO_DEVICE));
  STR_FWK_OP_KERNEL aicpu_task = {0};
  auto sec_ret = memcpy_s(&aicpu_task, sizeof(STR_FWK_OP_KERNEL),
                          kernel_def.args().data(), kernel_def.args().size());
  if (sec_ret != EOK) {
    GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "[Update][TaskArgs] failed, ret: %d", sec_ret);
    REPORT_INNER_ERROR("E19999", "update STR_FWK_OP_KERNEL args failed because memcpy_s returned %d.", sec_ret);
    return ACL_ERROR_GE_MEMORY_OPERATE_FAILED;
  }
  aicpu_task.fwkKernelBase.fwk_kernel.inputOutputAddr = reinterpret_cast<uintptr_t>(copy_ioaddr_dev_);
  aicpu_task.fwkKernelBase.fwk_kernel.workspaceBaseAddr = reinterpret_cast<uintptr_t>(copy_workspace_buf_);
  aicpu_task.fwkKernelBase.fwk_kernel.extInfoAddr = 0;
  aicpu_task.fwkKernelBase.fwk_kernel.extInfoLen = 0;
  GE_CHK_RT_RET(rtMemcpy(copy_task_args_buf_, sizeof(STR_FWK_OP_KERNEL),
                         &aicpu_task, sizeof(STR_FWK_OP_KERNEL), RT_MEMCPY_HOST_TO_DEVICE));
  return SUCCESS;
}

Status AiCpuTask::LaunchKernel(const std::vector<GeTensorDesc> &input_desc,
                               const std::vector<DataBuffer> &input_buffers,
                               std::vector<GeTensorDesc> &output_desc,
                               std::vector<DataBuffer> &output_buffers,
                               rtStream_t stream) {
  GE_CHK_STATUS_RET_NOLOG(UpdateExtInfo(input_desc, output_desc, stream));
  if (unknown_type_ == DEPEND_COMPUTE) {
    std::vector<DataBuffer> summary_buffers;
    for (size_t i = 0; i < num_outputs_; ++i) {
      summary_buffers.emplace_back(output_summary_[i], sizeof(aicpu::FWKAdapter::ResultSummary), false);
    }
    GE_CHK_STATUS_RET_NOLOG(UpdateIoAddr(input_buffers, summary_buffers));
  } else {
    GE_CHK_STATUS_RET_NOLOG(UpdateIoAddr(input_buffers, output_buffers));
  }
  GE_CHK_STATUS_RET_NOLOG(LaunchKernel(stream));
  if (unknown_type_ == DEPEND_SHAPE_RANGE) {
    GE_CHK_RT_RET(rtStreamSynchronize(stream));
    GE_CHK_STATUS_RET_NOLOG(UpdateOutputShape(output_desc));
  } else if (unknown_type_ == DEPEND_COMPUTE) {
    GE_CHK_RT_RET(rtStreamSynchronize(stream));
    GE_CHK_STATUS_RET_NOLOG(UpdateShapeAndDataByResultSummary(output_desc, output_buffers, stream));
  }
  return SUCCESS;
}

Status AiCpuBaseTask::UpdateArgTable(const SingleOpModelParam &param) {
  // aicpu does not have workspace, for now
  return DoUpdateArgTable(param, false);
}

const std::string &AiCpuBaseTask::GetTaskType() const { return kTaskTypeAicpu; }

void AiCpuTask::GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) {
  arg_base = reinterpret_cast<uintptr_t *>(io_addr_host_.data());
  arg_count = io_addr_host_.size();
}

void AiCpuCCTask::SetKernelArgs(std::unique_ptr<uint8_t[]> args, size_t arg_size) {
  args_ = std::move(args);
  arg_size_ = arg_size;
  // The block dim value defaults to 1 for rtCpuKernelLaunch
  block_dim_ = 1;
}

void AiCpuCCTask::SetSoName(const std::string &so_name) { so_name_ = so_name; }

void AiCpuCCTask::SetkernelName(const std::string &kernel_name) { kernel_name_ = kernel_name; }

void AiCpuCCTask::SetIoAddr(uintptr_t *io_addr) { io_addr_ = io_addr; }

const void *AiCpuCCTask::GetArgs() const { return args_.get(); }

size_t AiCpuCCTask::GetArgSize() const { return arg_size_; }

AiCpuCCTask::~AiCpuCCTask() {
}

Status AiCpuCCTask::LaunchKernel(rtStream_t stream) {
  GELOGI("To invoke rtCpuKernelLaunch. block_dim = %u, so_name is %s, kernel_name is %s", block_dim_, so_name_.data(),
         kernel_name_.data());
  // sm_desc is nullptr because the L2 buffer is not supported
  auto *sm_desc = reinterpret_cast<rtSmDesc_t *>(sm_desc_);
  auto ret = rtCpuKernelLaunchWithFlag(static_cast<const void *>(so_name_.data()),
                                       static_cast<const void *>(kernel_name_.data()),
                                       block_dim_, args_.get(), static_cast<uint32_t>(arg_size_),
                                       sm_desc, stream, dump_flag_);
  if (ret != RT_ERROR_NONE) {
    GELOGE(ret, "[Invoke][rtCpuKernelLaunchWithFlag] failed. ret = %d.", ret);
    REPORT_CALL_ERROR("E19999", "invoke rtCpuKernelLaunchWithFlag failed, ret:%d.", ret);
    return RT_ERROR_TO_GE_STATUS(ret);
  }
  GELOGI("[TASK_INFO] %lu/%s", kernel_id_, op_type_.c_str());
  GELOGD("Invoke rtCpuKernelLaunch succeeded");
  return SUCCESS;
}

Status AiCpuCCTask::LaunchKernel(const std::vector<GeTensorDesc> &input_desc,
                                 const std::vector<DataBuffer> &input_buffers,
                                 std::vector<GeTensorDesc> &output_desc,
                                 std::vector<DataBuffer> &output_buffers,
                                 rtStream_t stream) {
  GE_CHK_STATUS_RET_NOLOG(UpdateExtInfo(input_desc, output_desc, stream));
  GE_CHK_STATUS_RET_NOLOG(UpdateIoAddr(input_buffers, output_buffers));
  GE_CHK_STATUS_RET_NOLOG(LaunchKernel(stream));
  if (unknown_type_ == DEPEND_SHAPE_RANGE) {
    GE_CHK_RT_RET(rtStreamSynchronize(stream));
    GE_CHK_STATUS_RET_NOLOG(UpdateOutputShape(output_desc));
  } else if (unknown_type_ == DEPEND_COMPUTE) {
    GE_CHK_RT_RET(rtStreamSynchronize(stream));
    GE_CHK_STATUS_RET_NOLOG(UpdateShapeAndDataByResultSummary(output_desc, output_buffers, stream));
  }
  return SUCCESS;
}

Status AiCpuCCTask::InitForSummaryAndCopy() {
  if (unknown_type_ != DEPEND_COMPUTE || num_outputs_ == 0) {
    GELOGI("Unknown_type is %d, output num is %zu.", unknown_type_, num_outputs_);
    return SUCCESS;
  }
  output_summary_.resize(num_outputs_);
  constexpr auto result_summary_size = sizeof(aicpu::FWKAdapter::ResultSummary);
  for (size_t i = 0; i < num_outputs_; ++i) {
    GE_CHK_RT_RET(rtMalloc(&output_summary_[i], result_summary_size, RT_MEMORY_HBM));
  }
  output_summary_host_.resize(num_outputs_);
  const size_t copy_input_buf_len = num_outputs_ * kCopyNum * sizeof(uint64_t);
  GE_CHK_RT_RET(rtMalloc(&copy_input_release_flag_dev_, copy_input_buf_len, RT_MEMORY_HBM));
  GE_CHK_RT_RET(rtMalloc(&copy_input_data_size_dev_, copy_input_buf_len, RT_MEMORY_HBM));
  GE_CHK_RT_RET(rtMalloc(&copy_input_src_dev_, copy_input_buf_len, RT_MEMORY_HBM));
  GE_CHK_RT_RET(rtMalloc(&copy_input_dst_dev_, copy_input_buf_len, RT_MEMORY_HBM));
  std::vector<uint64_t> copy_io_addr;
  copy_io_addr.emplace_back(reinterpret_cast<uintptr_t>(copy_input_release_flag_dev_));
  copy_io_addr.emplace_back(reinterpret_cast<uintptr_t>(copy_input_data_size_dev_));
  copy_io_addr.emplace_back(reinterpret_cast<uintptr_t>(copy_input_src_dev_));
  copy_io_addr.emplace_back(reinterpret_cast<uintptr_t>(copy_input_dst_dev_));
  const auto copy_io_addr_size = sizeof(uint64_t) * copy_io_addr.size();
  GE_CHK_RT_RET(rtMalloc(&copy_ioaddr_dev_, copy_io_addr_size, RT_MEMORY_HBM));
  GE_CHK_RT_RET(rtMemcpy(copy_ioaddr_dev_, copy_io_addr_size,
                         copy_io_addr.data(), copy_io_addr_size, RT_MEMCPY_HOST_TO_DEVICE));
  return SUCCESS;
}
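
// The memcpy KernelDef's args blob is an aicpu::AicpuParamHead followed by
// ioAddrNum device addresses; the address patched in here is copy_ioaddr_dev_,
// which points at the four parallel copy arrays set up above.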
Status AiCpuCCTask::SetMemCopyTask(const domi::KernelDef &kernel_def) {
  auto &memcpy_args = kernel_def.args();
  memcpy_args_size_ = kernel_def.args_size();
  memcpy_so_name_ = kernel_def.so_name();
  memcpy_kernel_name_ = kernel_def.kernel_name();
  GE_IF_BOOL_EXEC(memcpy_args.size() != memcpy_args_size_,
                  REPORT_INNER_ERROR("E19999", "MemCopy task def args.size=%zu and args_size=%u are not equal.",
                                     memcpy_args.size(), memcpy_args_size_);
                  GELOGE(FAILED, "[Check][Size]MemCopy task def args.size=%zu and args_size=%u are not equal.",
                         memcpy_args.size(), memcpy_args_size_);
                  return FAILED;);
  GE_IF_BOOL_EXEC(memcpy_args_size_ < sizeof(aicpu::AicpuParamHead),
                  REPORT_INNER_ERROR("E19999",
                                     "Task def args_size=%u is less than aicpu param head len=%zu.",
                                     memcpy_args_size_, sizeof(aicpu::AicpuParamHead));
                  GELOGE(FAILED,
                         "[Check][Size] Task def args_size=%u is less than aicpu param head len=%zu.",
                         memcpy_args_size_, sizeof(aicpu::AicpuParamHead));
                  return FAILED;);
  memcpy_args_.reset(new (std::nothrow) uint8_t[memcpy_args_size_]());
  GE_IF_BOOL_EXEC(memcpy_args_ == nullptr,
                  REPORT_INNER_ERROR("E19999", "new memory failed for Node[MemCopy], task_size[%u].",
                                     memcpy_args_size_);
                  GELOGE(FAILED, "[Malloc][Memory] failed for Node[MemCopy], task_size[%u].",
                         memcpy_args_size_);
                  return FAILED;);
  errno_t sec_ret = memcpy_s(memcpy_args_.get(), memcpy_args_size_, memcpy_args.c_str(), memcpy_args.size());
  GE_IF_BOOL_EXEC(sec_ret != EOK,
                  REPORT_INNER_ERROR("E19999",
                                     "memcpy_s args failed for Node[MemCopy], ret: %d", sec_ret);
                  GELOGE(INTERNAL_ERROR,
                         "[Update][args] failed for Node[MemCopy], ret: %d", sec_ret);
                  return sec_ret;);
  auto memcpy_param_head = reinterpret_cast<aicpu::AicpuParamHead *>(memcpy_args_.get());
  uint32_t memcpy_io_num = memcpy_param_head->ioAddrNum;
  auto memcpy_io_addr = memcpy_args_.get() + sizeof(aicpu::AicpuParamHead);
  // if there are inputs and outputs, copy their addresses into the io-addr area
  int cpy_ret = memcpy_s(memcpy_io_addr, memcpy_args_size_ - sizeof(aicpu::AicpuParamHead),
                         &copy_ioaddr_dev_, sizeof(uint64_t) * memcpy_io_num);
  GE_IF_BOOL_EXEC(cpy_ret != 0,
                  REPORT_INNER_ERROR("E19999", "Node[MemCopy] memcpy io addr to AicpuParamHead failed, "
                                     "ret=%d, args_size=%u, io nums=%u.",
                                     cpy_ret, memcpy_args_size_, memcpy_io_num);
                  GELOGE(INTERNAL_ERROR, "[Update][io_addr]Node[MemCopy] memcpy io addr to AicpuParamHead failed, "
                         "ret=%d, args_size=%u, io nums=%u.",
                         cpy_ret, memcpy_args_size_, memcpy_io_num);
                  return INTERNAL_ERROR;);
  GELOGD("Set memcpy task for node[MemCopy] successfully.");
  return SUCCESS;
}

void AiCpuCCTask::GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) {
  arg_base = io_addr_;
  arg_count = io_addr_num_;
}

Status MemcpyAsyncTask::LaunchKernel(rtStream_t stream) {
  auto src_addr = reinterpret_cast<void *>(addresses_[0]);
  auto dst_addr = reinterpret_cast<void *>(addresses_[1]);
  kind_ = (kind_ == RT_MEMCPY_ADDR_DEVICE_TO_DEVICE) ? RT_MEMCPY_DEVICE_TO_DEVICE : kind_;
  GE_CHK_RT_RET(rtMemcpyAsync(dst_addr, dst_max_, src_addr, count_, kind_, stream));
  return SUCCESS;
}

void MemcpyAsyncTask::GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) {
  arg_base = addresses_;
  arg_count = kMemcpyArgCount;
}
}  // namespace ge

The Graph Engine (GE) module is a submodule of MindSpore, implemented in C++. It sits between the front-end module (ME) and the underlying hardware and bridges the two: it takes the graph delivered by ME as input, performs a series of deep graph optimizations, and outputs a graph that can run efficiently on the underlying hardware. GE applies optimizations tailored to the hardware architecture of the Ascend AI processor to make full use of its compute power. During model training and inference, GE is invoked automatically and is transparent to the user. GE consists of two main parts, GE API and GE Core; the detailed architecture diagram is shown below.