You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

hybrid_model_async_executor.cc 17 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400
  1. /**
  2. * Copyright 2019-2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "hybrid/executor/hybrid_model_async_executor.h"
  17. #include "graph/load/new_model_manager/model_utils.h"
  18. #include "graph/utils/tensor_utils.h"
  19. #include "graph/utils/type_utils.h"
  20. #include "graph/ge_context.h"
  21. #include "omm/csa_interact.h"
  22. namespace ge {
  23. namespace hybrid {
  namespace {
  /// Output-descriptor index passed to GetOutputDescPtr() for each root-graph input node
  /// when its input tensor is sized. Declared constexpr because it is never modified.
  constexpr int kDataOutputIndex = 0;
  }  // namespace
  27. HybridModelAsyncExecutor::HybridModelAsyncExecutor(HybridModel *model)
  28. : model_(model), run_flag_(false) {
  29. }
  30. HybridModelAsyncExecutor::~HybridModelAsyncExecutor() {
  31. if (stream_ != nullptr) {
  32. GE_CHK_RT(rtStreamDestroy(stream_));
  33. }
  34. }
  35. void HybridModelAsyncExecutor::SetDeviceId(uint32_t device_id) {
  36. device_id_ = device_id;
  37. }
  38. void HybridModelAsyncExecutor::SetModelId(uint32_t model_id) {
  39. model_id_ = model_id;
  40. }
  41. Status HybridModelAsyncExecutor::EnqueueData(const shared_ptr<InputDataWrapper> &data) {
  42. GE_CHK_STATUS_EXEC(data_inputer_->Push(data), return domi::DATA_QUEUE_ISFULL,
  43. "Data queue is full, please call again later, model_id %u ", model_id_);
  44. GELOGD("EnqueueData successfully. model_id = %u, data_index = %u", data->GetInput().model_id, data->GetInput().index);
  45. return SUCCESS;
  46. }
  47. Status HybridModelAsyncExecutor::Start(const std::shared_ptr<ModelListener> &listener) {
  48. GELOGD("HybridModelExecutor::Start IN, listener = %p", listener.get());
  49. std::lock_guard<std::mutex> lk(mu_);
  50. GE_CHK_BOOL_RET_STATUS(!run_flag_, INTERNAL_ERROR, "Model already started.");
  51. run_flag_ = true;
  52. listener_ = listener;
  53. future_ = std::async([&]() -> Status {
  54. GetContext().SetSessionId(executor_->GetContext()->session_id);
  55. return RunInternal();
  56. });
  57. GE_CHK_BOOL_RET_STATUS(future_.valid(), INTERNAL_ERROR, "Failed to start.");
  58. GELOGD("HybridModelExecutor::Start successfully");
  59. return SUCCESS;
  60. }
  61. Status HybridModelAsyncExecutor::Stop() {
  62. std::lock_guard<std::mutex> lk(mu_);
  63. run_flag_ = false;
  64. data_inputer_->Stop();
  65. auto ret = future_.get();
  66. if (stream_ != nullptr) {
  67. GE_CHK_RT(rtStreamDestroy(stream_));
  68. stream_ = nullptr;
  69. }
  70. return ret;
  71. }
  72. Status HybridModelAsyncExecutor::Init() {
  73. data_inputer_ = std::unique_ptr<DataInputer>(new(std::nothrow) DataInputer());
  74. GE_CHECK_NOTNULL(data_inputer_);
  75. GE_CHK_RT_RET(rtStreamCreate(&stream_, RT_STREAM_PRIORITY_DEFAULT));
  76. executor_ = std::unique_ptr<HybridModelExecutor>(new(std::nothrow) HybridModelExecutor(model_, device_id_, stream_));
  77. GE_CHECK_NOTNULL(executor_);
  78. GE_CHK_STATUS_RET(executor_->Init(), "Failed to init hybrid engine");
  79. GE_CHK_STATUS_RET(InitInputTensors(), "Failed to init input tensors");
  80. return SUCCESS;
  81. }
  82. Status HybridModelAsyncExecutor::PreRun(InputData &current_data) {
  83. GE_CHK_STATUS_RET(SyncVarData(), "Failed to sync var data");
  84. RECORD_MODEL_EXECUTION_EVENT(executor_->GetContext(), "[SyncVarData] End");
  85. GE_CHK_STATUS_RET(CopyInputData(current_data), "Failed to copy input data to model");
  86. RECORD_MODEL_EXECUTION_EVENT(executor_->GetContext(), "[CopyInputData] End");
  87. return SUCCESS;
  88. }
  89. Status HybridModelAsyncExecutor::RunInternal() {
  90. auto device_id = static_cast<int32_t>(device_id_);
  91. GELOGD("Hybrid model start. model_id = %u, device_id = %u", model_id_, device_id_);
  92. GE_CHK_RT_RET(rtSetDevice(device_id));
  93. // DeviceReset before thread run finished!
  94. GE_MAKE_GUARD(not_used_var, [&] { GE_CHK_RT(rtDeviceReset(device_id)); });
  95. while (run_flag_) {
  96. std::shared_ptr<InputDataWrapper> data_wrapper;
  97. Status ret = data_inputer_->Pop(data_wrapper);
  98. if (data_wrapper == nullptr || ret != SUCCESS) {
  99. GELOGI("data_wrapper is null!, ret = %u", ret);
  100. continue;
  101. }
  102. GELOGI("Getting the input data, model_id:%u", model_id_);
  103. GE_IF_BOOL_EXEC(!run_flag_, break);
  104. InputData current_data = data_wrapper->GetInput();
  105. GELOGI("Model thread Run begin, model id:%u, data index:%u.", model_id_, current_data.index);
  106. HybridModelExecutor::ExecuteArgs args;
  107. args.inputs.resize(input_tensors_.size());
  108. for (auto &it : input_tensors_) {
  109. args.inputs[it.first] = it.second;
  110. }
  111. RECORD_MODEL_EXECUTION_EVENT(executor_->GetContext(), "[RunInternal] [iteration = %d] Start", iterator_count_);
  112. ret = PreRun(current_data);
  113. GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(
  114. ret != SUCCESS, (void) HandleResult(ret, current_data.index, args, data_wrapper->GetOutput());
  115. CsaInteract::GetInstance().StoreInternalErrorCode(ret, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC);
  116. continue, "PreRun failed."); // [No need to check value]
  117. ret = executor_->Execute(args);
  118. ret = HandleResult(ret, current_data.index, args, data_wrapper->GetOutput());
  119. if (ret != SUCCESS) {
  120. CsaInteract::GetInstance().StoreInternalErrorCode(ret, ERROR_MODULE_RUNTIME, JOBSUBSTATE_GRAPH_EXEC);
  121. continue;
  122. }
  123. RECORD_MODEL_EXECUTION_EVENT(executor_->GetContext(), "[RunInternal] [iteration = %d] End", iterator_count_);
  124. iterator_count_++;
  125. GELOGI("run iterator count is %lu", iterator_count_);
  126. }
  127. CsaInteract::GetInstance().WriteInternalErrorCode();
  128. GELOGI("Model run end, model id:%u", model_id_);
  129. return SUCCESS;
  130. }
  131. Status HybridModelAsyncExecutor::HandleResult(Status exec_ret,
  132. uint32_t data_id,
  133. HybridModelExecutor::ExecuteArgs &args,
  134. OutputData *output_data) {
  135. GELOGD("Start to handle result. model id = %u, data index = %u, execution ret = %u", model_id_, data_id, exec_ret);
  136. std::vector<ge::OutputTensorInfo> output_tensor_info_list;
  137. if (exec_ret == END_OF_SEQUENCE) {
  138. GELOGW("End of sequence, model id = %u", model_id_);
  139. return OnComputeDone(data_id, END_OF_SEQUENCE, output_tensor_info_list);
  140. }
  141. if (exec_ret != SUCCESS) {
  142. GELOGE(exec_ret, "Failed to execute graph. model_id = %u", model_id_);
  143. return OnComputeDone(data_id, INTERNAL_ERROR, output_tensor_info_list);
  144. }
  145. GE_CHECK_NOTNULL(output_data);
  146. auto ret = CopyOutputs(args, output_data, output_tensor_info_list);
  147. if (ret != SUCCESS) {
  148. OnComputeDone(data_id, INTERNAL_ERROR, output_tensor_info_list);
  149. return INTERNAL_ERROR;
  150. }
  151. GELOGD("Executed graph successfully, model id = %u, data_index = %u", model_id_, data_id);
  152. return OnComputeDone(data_id, SUCCESS, output_tensor_info_list);
  153. }
  154. Status HybridModelAsyncExecutor::SyncVarData() {
  155. GELOGI("Sync var data, model id:%u", model_id_);
  156. TensorValue *global_step_var = model_->GetVariable(NODE_NAME_GLOBAL_STEP);
  157. if (global_step_var != nullptr) {
  158. std::vector<uint64_t> v_step;
  159. v_step.push_back(iterator_count_);
  160. GE_CHK_RT_RET(rtMemcpy(global_step_var->MutableData(),
  161. global_step_var->GetSize(),
  162. v_step.data(),
  163. v_step.size() * sizeof(uint64_t),
  164. RT_MEMCPY_HOST_TO_DEVICE));
  165. } else {
  166. GELOGD("No GLOBAL_STEP variable was found.");
  167. }
  168. return SUCCESS;
  169. }
  170. Status HybridModelAsyncExecutor::CopyInputData(const InputData &current_data) {
  171. const std::vector<DataBuffer> &blobs = current_data.blobs;
  172. for (const auto &it : input_tensors_) {
  173. auto input_index = it.first;
  174. auto input_tensor = it.second;
  175. auto data_size = input_tensor.GetSize();
  176. GELOGD("To copy input data for input[%u]", input_index);
  177. if (input_index >= blobs.size()) {
  178. GELOGE(FAILED, "Blobs not match: blobs=%zu, tensor=%zu, index=%u, size=%ld",
  179. blobs.size(), model_->input_nodes_.size(), input_index, data_size);
  180. return FAILED;
  181. }
  182. const DataBuffer &data_buf = blobs[input_index];
  183. auto mem_size = static_cast<uint32_t>(data_size);
  184. GE_CHK_BOOL_RET_STATUS(mem_size >= data_buf.length,
  185. PARAM_INVALID,
  186. "input data size(%u) does not match model required size(%u), ret failed.",
  187. data_buf.length,
  188. mem_size);
  189. GELOGI("[IMAS]CopyPlainData memcpy graph_%u type[F] output[%u] memaddr[%p] mem_size[%u] datasize[%u]",
  190. model_->root_runtime_param_.graph_id, input_index, input_tensor.GetData(), mem_size, data_buf.length);
  191. GE_CHK_RT_RET(rtMemcpy(input_tensor.MutableData(),
  192. mem_size,
  193. data_buf.data,
  194. data_buf.length,
  195. RT_MEMCPY_HOST_TO_DEVICE));
  196. }
  197. return SUCCESS;
  198. }
  199. Status HybridModelAsyncExecutor::InitInputTensors() {
  200. auto allocator = NpuMemoryAllocator::GetAllocator(device_id_);
  201. GE_CHECK_NOTNULL(allocator);
  202. int input_index = 0;
  203. for (const auto &input_node : model_->GetRootGraphItem()->GetInputNodes()) {
  204. GELOGD("Init input[%u], node = %s", input_index, input_node->NodeName().c_str());
  205. auto output_desc = input_node->op_desc->GetOutputDescPtr(kDataOutputIndex);
  206. GE_CHECK_NOTNULL(output_desc);
  207. int64_t tensor_size = 0;
  208. GE_CHK_GRAPH_STATUS_RET(TensorUtils::GetSize(*output_desc, tensor_size),
  209. "Failed to get size from %s",
  210. input_node->NodeName().c_str());
  211. if (tensor_size == 0) {
  212. GELOGW("[%s] Tensor size == 0", input_node->NodeName().c_str());
  213. GE_CHK_GRAPH_STATUS_RET(TensorUtils::GetTensorMemorySizeInBytes(*output_desc, tensor_size),
  214. "Failed to calc tensor size");
  215. GELOGD("[%s] Tensor size updated to %ld", input_node->NodeName().c_str(), tensor_size);
  216. }
  217. auto buffer = TensorBuffer::Create(allocator, tensor_size);
  218. GE_CHECK_NOTNULL(buffer);
  219. TensorValue tensor(shared_ptr<TensorBuffer>(buffer.release()));
  220. tensor.SetName("Input_" + input_node->NodeName());
  221. input_tensors_.emplace(input_index, tensor);
  222. input_index += 1;
  223. }
  224. return SUCCESS;
  225. }
  226. Status HybridModelAsyncExecutor::OnComputeDone(uint32_t data_index, uint32_t result_code,
  227. std::vector<ge::OutputTensorInfo> &outputs) {
  228. GELOGD("OnComputeDone. model id = %u, data index = %u, execution ret = %u", model_id_, data_index, result_code);
  229. if (listener_ != nullptr) {
  230. GE_CHK_STATUS(listener_->OnComputeDone(model_id_, data_index, result_code, outputs),
  231. "OnComputeDone failed");
  232. }
  233. return result_code;
  234. }
  235. Status HybridModelAsyncExecutor::CopyOutputs(HybridModelExecutor::ExecuteArgs &args,
  236. OutputData *output_data,
  237. std::vector<ge::OutputTensorInfo> &outputs) {
  238. // copy output data from op to designated position
  239. std::vector<ConstGeTensorDescPtr> &output_tensor_desc_list = args.output_desc;
  240. std::vector<TensorValue> &output_tensors = args.outputs;
  241. if (output_tensor_desc_list.size() != output_tensors.size()) {
  242. GELOGE(INTERNAL_ERROR,
  243. "Output sizes mismatch. From op_desc = %zu, and from output tensors = %zu",
  244. output_tensor_desc_list.size(),
  245. output_tensors.size());
  246. return INTERNAL_ERROR;
  247. }
  248. GELOGD("Number of outputs = %zu", output_tensor_desc_list.size());
  249. for (size_t i = 0; i < output_tensors.size(); ++i) {
  250. GELOGD("Start to process output[%zu]", i);
  251. auto &output_tensor = output_tensors[i];
  252. auto &tensor_desc = output_tensor_desc_list.at(i);
  253. GE_CHECK_NOTNULL(tensor_desc);
  254. int64_t output_size = -1;
  255. GE_CHK_GRAPH_STATUS_RET(TensorUtils::CalcTensorMemSize(tensor_desc->GetShape(),
  256. tensor_desc->GetFormat(),
  257. tensor_desc->GetDataType(),
  258. output_size),
  259. "Failed to calc tensor size for output[%zu]. shape = [%s], type = %s, format = %s",
  260. i,
  261. tensor_desc->GetShape().ToString().c_str(),
  262. TypeUtils::DataTypeToSerialString(tensor_desc->GetDataType()).c_str(),
  263. TypeUtils::FormatToSerialString(tensor_desc->GetFormat()).c_str());
  264. GELOGD("Got tensor size for output[%zu] successfully. shape = [%s], type = %s, format = %s, size = %ld",
  265. i,
  266. tensor_desc->GetShape().ToString().c_str(),
  267. TypeUtils::DataTypeToSerialString(tensor_desc->GetDataType()).c_str(),
  268. TypeUtils::FormatToSerialString(tensor_desc->GetFormat()).c_str(),
  269. output_size);
  270. GE_CHECK_GE(output_size, 0);
  271. GE_CHECK_LE(output_size, UINT32_MAX);
  272. if (output_tensor.GetSize() < static_cast<size_t>(output_size)) {
  273. GELOGE(INTERNAL_ERROR,
  274. "output[%zu] tensor size(%zu) is not enough for output shape [%s]",
  275. i, output_tensor.GetSize(), tensor_desc->GetShape().ToString().c_str());
  276. return INTERNAL_ERROR;
  277. }
  278. ge::OutputTensorInfo output;
  279. output.data_type = static_cast<uint32_t>(tensor_desc->GetDataType());
  280. output.dims = tensor_desc->GetShape().GetDims();
  281. output.length = output_size;
  282. if (output_size > 0) {
  283. std::unique_ptr<uint8_t[]> data_buf(new(std::nothrow) uint8_t[output_size]);
  284. GE_CHECK_NOTNULL(data_buf);
  285. GE_CHK_RT_RET(rtMemcpy(data_buf.get(),
  286. output_size,
  287. output_tensor.GetData(),
  288. output_size,
  289. RT_MEMCPY_DEVICE_TO_HOST));
  290. output.data = std::move(data_buf);
  291. output_data->blobs.emplace_back(data_buf.get(), static_cast<uint32_t>(output_size), false);
  292. } else {
  293. GELOGW("Output[%zu] is empty. shape = [%s]", i, tensor_desc->GetShape().ToString().c_str());
  294. output.data = nullptr;
  295. output_data->blobs.emplace_back(nullptr, 0U, false);
  296. }
  297. outputs.emplace_back(std::move(output));
  298. GELOGD("Output[%zu] added, type = %s, shape = [%s], size = %ld",
  299. i,
  300. TypeUtils::DataTypeToSerialString(tensor_desc->GetDataType()).c_str(),
  301. tensor_desc->GetShape().ToString().c_str(),
  302. output_size);
  303. }
  304. return SUCCESS;
  305. }
  306. Status HybridModelAsyncExecutor::Execute(const vector<GeTensor> &inputs, vector<GeTensor> &outputs) {
  307. GELOGD("Start to execute model.");
  308. // prepare inputs
  309. InputData input_data;
  310. for (auto &tensor : inputs) {
  311. DataBuffer buffer;
  312. buffer.data = const_cast<uint8_t *>(tensor.GetData().GetData());
  313. buffer.length = tensor.GetData().size();
  314. input_data.blobs.emplace_back(buffer);
  315. }
  316. GE_CHK_STATUS_RET(CopyInputData(input_data), "Failed to copy input data to model");
  317. GELOGD("Done copying input data successfully.");
  318. HybridModelExecutor::ExecuteArgs args;
  319. args.inputs.resize(input_tensors_.size());
  320. args.input_desc.resize(input_tensors_.size());
  321. for (auto &it : input_tensors_) {
  322. args.inputs[it.first] = it.second;
  323. args.input_desc[it.first] = MakeShared<GeTensorDesc>(inputs[it.first].GetTensorDesc());
  324. }
  325. GE_CHK_STATUS_RET(executor_->Execute(args), "Failed to execute model.");
  326. std::vector<ge::OutputTensorInfo> output_tensor_info_list;
  327. OutputData output_data;
  328. GE_CHK_STATUS_RET(CopyOutputs(args, &output_data, output_tensor_info_list), "Failed to copy outputs.");
  329. GELOGD("Done copying output data successfully. output count = %zu", output_tensor_info_list.size());
  330. int out_index = 0;
  331. outputs.resize(output_tensor_info_list.size());
  332. for (auto &out_tensor_info : output_tensor_info_list) {
  333. auto &ge_tensor = outputs[out_index];
  334. if (out_tensor_info.length > 0) {
  335. GE_CHK_GRAPH_STATUS_RET(ge_tensor.SetData(out_tensor_info.data.get(), out_tensor_info.length),
  336. "Failed to set output[%d].", out_index);
  337. }
  338. ge_tensor.MutableTensorDesc() = *args.output_desc[out_index];
  339. GELOGD("Set output[%d], tensor size = %ld, shape = [%s]",
  340. out_index,
  341. out_tensor_info.length,
  342. ge_tensor.MutableTensorDesc().MutableShape().ToString().c_str());
  343. ++out_index;
  344. }
  345. return SUCCESS;
  346. }
  347. } // namespace hybrid
  348. } // namespace ge

图引擎模块(GE)是MindSpore的一个子模块,其代码由C++实现,位于前端模块ME和底层硬件之间,起到承接作用。图引擎模块以ME下发的图作为输入,然后进行一系列的深度图优化操作,最后输出一张可以在底层硬件上高效运行的图。GE针对昇腾AI处理器的硬件结构特点,做了特定的优化工作,以此来充分发挥出昇腾AI处理器的强大算力。在进行模型训练/推理时,GE会被自动调用而用户并不感知。GE主要由GE API和GE Core两部分组成,详细的架构图如下所示：