You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; can include dashes ('-'); and can be up to 35 characters long.

davinci_model.h 32 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025
  1. /**
  2. * Copyright 2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef GE_GRAPH_LOAD_NEW_MODEL_MANAGER_DAVINCI_MODEL_H_
  17. #define GE_GRAPH_LOAD_NEW_MODEL_MANAGER_DAVINCI_MODEL_H_
  18. #include <map>
  19. #include <memory>
  20. #include <set>
  21. #include <string>
  22. #include <thread>
  23. #include <vector>
  24. #include "common/ge_types.h"
  25. #include "common/helper/model_helper.h"
  26. #include "common/helper/om_file_helper.h"
  27. #include "common/opskernel/ge_task_info.h"
  28. #include "common/properties_manager.h"
  29. #include "common/types.h"
  30. #include "framework/common/util.h"
  31. #include "graph/debug/ge_attr_define.h"
  32. #include "graph/load/new_model_manager/aipp_utils.h"
  33. #include "graph/load/new_model_manager/data_dumper.h"
  34. #include "graph/load/new_model_manager/data_inputer.h"
  35. #include "graph/load/new_model_manager/model_utils.h"
  36. #include "graph/load/new_model_manager/zero_copy_offset.h"
  37. #include "graph/load/new_model_manager/zero_copy_task.h"
  38. #include "graph/model.h"
  39. #include "graph/node.h"
  40. #include "graph/op_desc.h"
  41. #include "graph/operator.h"
  42. #include "graph/utils/attr_utils.h"
  43. #include "graph/utils/tensor_utils.h"
  44. #include "mmpa/mmpa_api.h"
  45. #include "proto/task.pb.h"
  46. #include "task_info/task_info.h"
  47. #include "graph/common/local_context.h"
  48. namespace ge {
// Reserved sizes for the op-debug buffers allocated at model load.
// NOTE(review): the original comment said "2048 bits", but the constant is a
// size_t value of 2048 — presumably a byte count; confirm the unit against
// the runtime allocation that consumes it.
const size_t kOpDebugMemorySize = 2048UL;
// Size of the small P2P buffer used alongside op debug.
const size_t kDebugP2pSize = 8UL;
// Stages of one model execution, used to tag profiling timestamps
// (see DavinciModel::SetProfileTime). Values come in begin/end pairs.
typedef enum tagModelProcStage {
MODEL_LOAD_START = 1,    // model loading begins
MODEL_LOAD_END,          // model loading finished
MODEL_PRE_PROC_START,    // input pre-processing begins
MODEL_PRE_PROC_END,      // input pre-processing finished
MODEL_INFER_START,       // inference begins
MODEL_INFER_END,         // inference finished
MODEL_AFTER_PROC_START,  // output post-processing begins
MODEL_AFTER_PROC_END,    // output post-processing finished
MODEL_PROC_INVALID,      // sentinel: no valid stage
} ModelProcStage;
// Timestamps collected for one model run, keyed by the owning model id.
// Filled via SetProfileTime and reported through the profiling sink.
struct timeInfo {
uint32_t modelId;            // id of the model these times belong to
int64_t processBeginTime;    // whole-request processing window
int64_t processEndTime;
int64_t inferenceBeginTime;  // inference window
int64_t inferenceEndTime;
int64_t dumpBeginTime;       // data-dump window
int64_t dumpEndTime;
};
// For super kernel: accumulated state of the kernel tasks fused into one
// super kernel. The parallel vectors (kernel_list / arg_list /
// dump_flag_list / op_desc_list / dump_args_list) are indexed together,
// one entry per fused kernel; the last_* fields snapshot the most recently
// added task. Modified from KernelTaskInfo (see GetSuperKernelTaskInfo).
struct SuperKernelTaskInfo {
uint32_t last_block_dim;   // block dim of the last fused task
uint32_t last_args_size;   // args size of the last fused task
uint32_t last_task_id;     // runtime task id of the last fused task
uint32_t last_stream_id;   // stream id of the last fused task
void *last_stream;         // stream handle of the last fused task
void *last_sm_desc;        // SM descriptor of the last fused task
std::vector<void *> kernel_list;      // one kernel handle per fused task
std::vector<void *> arg_list;         // one args pointer per fused task
std::vector<uint32_t> dump_flag_list; // one dump flag per fused task
std::vector<OpDescPtr> op_desc_list;  // one op desc per fused task
std::vector<uintptr_t> dump_args_list;  // one dump-args addr per fused task
uint32_t last_dump_flag;   // dump flag of the last fused task
int64_t last_group_key;    // group key of the last fused task
uintptr_t last_dump_args;  // dump-args addr of the last fused task
OpDescPtr last_op;         // op desc of the last fused task
};
// Per-task memory footprint in bytes, aggregated for profiling
// (see ProfileInfo::memory_info).
struct TaskMemInfo {
int64_t input_size{0};      // total input tensor bytes
int64_t output_size{0};     // total output tensor bytes
int64_t weight_size{0};     // total weight bytes
int64_t workspace_size{0};  // total workspace bytes
int64_t total_size{0};      // sum of the above
};
// Profiling record for one (possibly fused) op: fusion identity, memory
// footprint, and how many runtime tasks it expanded into.
// Built by InitModelProfile and reported by SinkModelProfile.
struct ProfileInfo {
FusionOpInfo fusion_info;  // identity of the fusion op this record covers
TaskMemInfo memory_info;   // aggregated memory footprint
uint32_t task_count{0};    // number of runtime tasks for this op
};
// How the model is currently being driven. Presumably: not yet executed
// (INITIALIZATION), synchronous NnExecute, or asynchronous NnExecute —
// confirm against the .cc that switches on this enum.
enum ExecuteMode {
INITIALIZATION,
SYNCHRONIZATION,
ASYNCHRONIZATION,
};
// DavinciModel: loads an offline GeModel onto the device and drives its
// execution (streams, queues, zero-copy I/O, profiling, data dump).
  108. class DavinciModel {
  109. public:
  110. ///
  111. /// @ingroup ge
  112. /// @brief DavinciModel constructor
  113. /// @author
  114. ///
  115. DavinciModel(int32_t priority, const std::shared_ptr<ModelListener> &listener);
  116. ///
  117. /// @ingroup ge
  118. /// @brief DavinciModel desctructor, free Parse and Init resources
  119. /// @author
  120. ///
  121. ~DavinciModel();
  122. ///
  123. /// @ingroup ge
  124. /// @brief apply model to model_def_
  125. ///
  126. Status Assign(const GeModelPtr &ge_model);
  127. ///
  128. /// @ingroup ge
  129. /// @brief DavinciModel initialization, including Stream, ccHandle, Event, DataInputer, etc
  130. /// @return execute result
  131. /// @author
  132. ///
  133. Status Init(void *dev_ptr = nullptr, size_t memsize = 0, void *weight_ptr = nullptr, size_t weightsize = 0);
  134. ///
  135. /// @ingroup ge
  136. /// @brief ACL case, Load task list with queue.
  137. /// @param [in] input_que_ids: input queue ids from user, nums equal Data Op.
  138. /// @param [in] output_que_ids: input queue ids from user, nums equal NetOutput Op.
  139. /// @return: 0 for success / others for fail
  140. ///
  141. Status SetQueIds(const std::vector<uint32_t> &input_queue_ids, const std::vector<uint32_t> &output_queue_ids);
  142. ///
  143. /// @ingroup ge
  144. /// @brief Get DataInputer
  145. /// @return model ID
  146. ///
  147. uint32_t Id() const { return model_id_; }
  148. ///
  149. /// @ingroup ge
  150. /// @brief Get DataInputer
  151. /// @return model ID
  152. ///
  153. void SetId(uint32_t model_id) { model_id_ = model_id; }
  154. static void *Run(DavinciModel *model_pointer);
  155. ///
  156. /// @ingroup ge
  157. /// @brief NnExecute
  158. /// @param [in] stream execute stream
  159. /// @param [in] async_mode is asynchronize mode.
  160. /// @param [in] input_data model input data
  161. /// @param [out] output_data model output data
  162. ///
  163. Status NnExecute(rtStream_t stream, bool async_mode, const InputData &input_data, OutputData &output_data);
  164. ///
  165. /// @ingroup ge
  166. /// @brief lock mutex run flag
  167. /// @author
  168. ///
  169. void LockRunFlg() { mux_run_flg_.lock(); }
  170. ///
  171. /// @ingroup ge
  172. /// @brief unlock mutex run flag
  173. /// @author
  174. ///
  175. void UnlockRunFlg() { mux_run_flg_.unlock(); }
  176. ///
  177. /// @ingroup ge
  178. /// @brief get DataInputer
  179. /// @return DataInputer pointer
  180. ///
  181. DataInputer *const GetDataInputer() const { return data_inputer_; }
  182. // get Stream number
  183. uint32_t StreamNum() const { return runtime_param_.stream_num; }
  184. // get Event number
  185. uint32_t EventNum() const { return runtime_param_.event_num; }
  186. // get Lable number
  187. uint32_t LabelNum() const { return runtime_param_.label_num; }
  188. // get batch number
  189. uint32_t BatchNum() const { return runtime_param_.batch_num; }
  190. // get session id
  191. uint64_t SessionId() const { return runtime_param_.session_id; }
  192. // get model priority
  193. int32_t Priority() const { return priority_; }
  194. // get total mem size
  195. size_t TotalMemSize() const { return runtime_param_.mem_size; }
  196. const std::map<uint32_t, MemInfo> &P2PMemInfos() const {return runtime_param_.memory_infos;}
  197. // model name
  198. string Name() const { return name_; }
  199. // om_name
  200. string OmName() const { return om_name_; }
  201. // version
  202. uint32_t Version() const { return version_; }
  203. // get total weights mem size
  204. size_t TotalWeightsMemSize() const { return runtime_param_.weight_size; }
  205. size_t TotalVarMemSize() const { return runtime_param_.var_size; }
  206. // get base memory address
  207. uint8_t *MemBase() { return mem_base_; }
  208. // get weight base memory address
  209. uint8_t *WeightsMemBase() { return weights_mem_base_; }
  210. uint8_t *VarMemBase() { return var_mem_base_; }
  211. // get Event list
  212. const vector<rtEvent_t> &GetEventList() const { return event_list_; }
  213. const vector<rtStream_t> &GetStreamList() const { return stream_list_; }
  214. const vector<rtLabel_t> &GetLabelList() const { return label_list_; }
  215. Status DestroyThread();
  216. // Get Data Op.
  217. const vector<OpDescPtr> &GetDataList() const { return data_op_list_; }
  218. // get Op
  219. OpDescPtr GetOpByIndex(uint32_t index) const {
  220. if (op_list_.find(index) == op_list_.end()) {
  221. return nullptr;
  222. }
  223. return op_list_.at(index);
  224. }
  225. OpDescPtr GetVariableOp(const string &name) {
  226. for (auto op_desc : variable_op_list_) {
  227. if (op_desc != nullptr && op_desc->GetName() == name) {
  228. return op_desc;
  229. }
  230. }
  231. return nullptr;
  232. }
  233. // get task info for profiling
  234. const std::vector<TaskDescInfo> &GetTaskDescInfo() const { return task_desc_info_; }
  235. // get updated task info list
  236. std::vector<TaskInfoPtr> GetTaskList() { return task_list_; }
  237. // Modified from KernelTaskInfo.
  238. SuperKernelTaskInfo &GetSuperKernelTaskInfo() { return skt_info_; }
  239. ///
  240. /// @ingroup ge
  241. /// @brief get model input and output format
  242. /// @return ccTensorFormat_t current model input and output format
  243. ///
  244. Format GetFormat();
  245. rtModel_t GetRtModelHandle() const { return rt_model_handle_; }
  246. rtStream_t GetRtModelStream() const { return rt_model_stream_; }
  247. uint64_t GetRtBaseAddr() const { return runtime_param_.logic_mem_base; }
  248. uint64_t GetRtWeightAddr() const { return runtime_param_.logic_weight_base; }
  249. uint64_t GetRtVarAddr() const { return runtime_param_.logic_var_base; }
  250. uint32_t GetFlowctrlIndex(uint32_t op_index);
  251. void PushHcclStream(rtStream_t value);
  252. bool IsBroadCastOpData(const NodePtr &var_node);
  253. ///
  254. /// @ingroup ge
  255. /// @brief For TVM Op, avoid Addr Reuse.
  256. /// @return void*
  257. ///
  258. const char *GetRegisterStub(const string &tvm_binfile_key, const string &session_graph_model_id = "");
  259. ///
  260. /// @ingroup ge
  261. /// @brief get model input and output desc info
  262. /// @param [out] input_shape model input size
  263. /// @param [out] output_shape model output size
  264. /// @return execute result
  265. ///
  266. Status GetInputOutputDescInfo(vector<InputOutputDescInfo> &input_desc, vector<InputOutputDescInfo> &output_desc);
  267. Status GetInputOutputDescInfo(vector<InputOutputDescInfo> &input_desc, vector<InputOutputDescInfo> &output_desc,
  268. std::vector<uint32_t> &inputFormats, std::vector<uint32_t> &output_formats);
  269. ///
  270. /// @ingroup ge
  271. /// @brief Get dynamic batch_info
  272. /// @param [out] batch_info
  273. /// @param [out] dynamic_type
  274. /// @return execute result
  275. ///
  276. Status GetDynamicBatchInfo(std::vector<std::vector<int64_t>> &batch_info, int32_t &dynamic_type) const;
  277. ///
  278. /// @ingroup ge
  279. /// @brief Get combined dynamic dims info
  280. /// @param [out] batch_info
  281. /// @return None
  282. ///
  283. void GetCombinedDynamicDims(std::vector<std::vector<int64_t>> &batch_info) const;
  284. void GetUserDesignateShapeOrder(std::vector<std::string> &user_input_shape_order) const;
  285. void GetCurShape(std::vector<int64_t> &batch_info, int32_t &dynamic_type);
  286. void GetModelAttr(std::vector<std::string> &dynamic_output_shape_info);
  287. ///
  288. /// @ingroup ge
  289. /// @brief Get AIPP input info
  290. /// @param [in] index
  291. /// @param [out] aipp_info
  292. /// @return execute result
  293. ///
  294. Status GetAIPPInfo(uint32_t index, AippConfigInfo &aipp_info);
  295. Status GetAippType(uint32_t index, InputAippType &type, size_t &aipp_index);
  296. ///
  297. /// @ingroup ge
  298. /// @brief Get model_id.
  299. /// @return model_id
  300. ///
  301. uint32_t GetModelId() const { return model_id_; }
  302. ///
  303. /// @ingroup ge
  304. /// @brief get unique identification for op when load two or more models
  305. /// @param [in] op_desc : current op.
  306. /// @param [in] string identification: unique identification for current op.
  307. /// @return None
  308. ///
  309. void GetUniqueId(const OpDescPtr &op_desc, std::string &unique_identification);
  310. ///
  311. /// @ingroup ge
  312. /// @brief get model input and output desc for zero copy
  313. /// @param [out] input_shape model input size
  314. /// @param [out] output_shape model output size
  315. /// @return execute result
  316. ///
  317. Status GetInputOutputDescInfoForZeroCopy(vector<InputOutputDescInfo> &input_desc,
  318. vector<InputOutputDescInfo> &output_desc,
  319. std::vector<uint32_t> &inputFormats, std::vector<uint32_t> &output_formats);
  320. Status ReturnResult(uint32_t data_id, const bool rslt_flg, const bool seq_end_flg, OutputData *output_data);
  321. Status ReturnNoOutput(uint32_t data_id);
  322. Status ModelRunStart();
  323. ///
  324. /// @ingroup ge
  325. /// @brief stop run model
  326. /// @return Status
  327. ///
  328. Status ModelRunStop();
  329. ///
  330. /// @ingroup ge
  331. /// @brief model run flag
  332. /// @return Status
  333. ///
  334. bool RunFlag() const { return run_flg_; }
  335. Status GetOutputDescInfo(vector<InputOutputDescInfo> &output_desc, std::vector<uint32_t> &formats);
  336. ///
  337. /// @ingroup ge
  338. /// @brief Set Session Id
  339. /// @return void
  340. ///
  341. void SetSessionId(uint64_t session_id) { session_id_ = session_id; }
  342. ///
  343. /// @ingroup ge
  344. /// @brief Get Session Id
  345. /// @return sessionID
  346. ///
  347. uint64_t GetSessionId() const { return session_id_; }
  348. ///
  349. /// @ingroup ge
  350. /// @brief SetDeviceId
  351. /// @return void
  352. ///
  353. void SetDeviceId(uint32_t device_id) { device_id_ = device_id; }
  354. ///
  355. /// @ingroup ge
  356. /// @brief Get device Id
  357. /// @return device id
  358. ///
  359. uint32_t GetDeviceId() const { return device_id_; }
  360. bool NeedDestroyAicpuKernel() const { return need_destroy_aicpu_kernel_; }
  361. Status UpdateSessionId(uint64_t session_id);
  362. const RuntimeParam &GetRuntimeParam() { return runtime_param_; }
  363. int32_t GetDataInputTid() const { return dataInputTid; }
  364. void SetDataInputTid(int32_t data_input_tid) { dataInputTid = data_input_tid; }
  365. void DisableZeroCopy(const void *addr);
  366. bool GetOpDugReg() const { return is_op_debug_reg_; }
  367. ///
  368. /// @ingroup ge
  369. /// @brief Save outside address of Data or NetOutput used info for ZeroCopy.
  370. /// @param [in] const OpDescPtr &op_desc: current op desc
  371. /// @param [in] const std::vector<void *> &outside_addrs: address of task
  372. /// @param [in] const void *args_offset: arguments address save the address.
  373. /// @return None.
  374. ///
  375. void SetZeroCopyAddr(const OpDescPtr &op_desc, const std::vector<void *> &outside_addrs, const void *info, void *args,
  376. size_t size, size_t offset);
  377. void SetDynamicSize(const std::vector<uint64_t> &batch_num, int32_t dynamic_type);
  378. bool GetL1FusionEnableOption() { return is_l1_fusion_enable_; }
  379. void SetProfileTime(ModelProcStage stage, int64_t endTime = 0);
  380. int64_t GetLoadBeginTime() { return load_begin_time_; }
  381. int64_t GetLoadEndTime() { return load_end_time_; }
  382. Status ReportProfilingData();
  383. void SaveDumpOpInfo(const RuntimeParam &model_param, const OpDescPtr &op, uint32_t task_id, uint32_t stream_id) {
  384. data_dumper_.SaveDumpOpInfo(model_param, op, task_id, stream_id);
  385. }
  386. void SaveDumpTask(uint32_t task_id, uint32_t stream_id, const std::shared_ptr<OpDesc> &op_desc, uintptr_t args) {
  387. data_dumper_.SaveDumpTask(task_id, stream_id, op_desc, args);
  388. }
  389. void SetEndGraphId(uint32_t task_id, uint32_t stream_id);
  390. DavinciModel &operator=(const DavinciModel &model) = delete;
  391. DavinciModel(const DavinciModel &model) = delete;
  392. const map<int64_t, std::vector<rtStream_t>> &GetHcclFolowStream() {
  393. return main_follow_stream_mapping_;
  394. }
  395. void SaveHcclFollowStream(int64_t main_stream_id, rtStream_t stream);
  396. void InitRuntimeParams();
  397. Status InitVariableMem();
  398. void UpdateMemBase(uint8_t *mem_base) {
  399. runtime_param_.mem_base = mem_base;
  400. mem_base_ = mem_base;
  401. }
  402. void SetTotalArgsSize(uint32_t args_size) { total_args_size_ += args_size; }
  403. uint32_t GetTotalArgsSize() { return total_args_size_; }
  404. void *GetCurrentArgsAddr(uint32_t offset) {
  405. void *cur_args = static_cast<char *>(args_) + offset;
  406. return cur_args;
  407. }
  408. void SetTotalIOAddrs(const vector<void *> &io_addrs);
  409. void SetHybridArgsSize(uint32_t args_size) { total_hybrid_args_size_ += args_size; }
  410. uint32_t GetHybridArgsSize() {
  411. return total_hybrid_args_size_;
  412. }
  413. void *GetCurrentHybridArgsAddr(uint32_t offset) {
  414. void *cur_args = static_cast<char *>(hybrid_addrs_) + offset;
  415. return cur_args;
  416. }
  417. void SetTotalFixedAddrsSize(string tensor_name, int64_t fix_addr_size);
  418. int64_t GetFixedAddrsSize(string tensor_name);
  419. void *GetCurrentFixedAddr(int64_t offset) const {
  420. void *cur_addr = static_cast<char *>(fixed_addrs_) + offset;
  421. return cur_addr;
  422. }
  423. uint32_t GetFixedAddrOutputIndex(string tensor_name) {
  424. if (tensor_name_to_peer_output_index_.find(tensor_name) != tensor_name_to_peer_output_index_.end()) {
  425. return tensor_name_to_peer_output_index_[tensor_name];
  426. }
  427. return UINT32_MAX;
  428. }
  429. void SetKnownNode(bool known_node) { known_node_ = known_node; }
  430. bool IsKnownNode() { return known_node_; }
  431. Status MallocKnownArgs();
  432. Status UpdateKnownNodeArgs(const vector<void *> &inputs, const vector<void *> &outputs);
  433. Status CreateKnownZeroCopyMap(const vector<void *> &inputs, const vector<void *> &outputs);
  434. Status UpdateKnownZeroCopyAddr(vector<void *> &total_io_addrs);
  435. void SetKnownNodeAddrNotChanged(bool base_addr_not_changed) { base_addr_not_changed_ = base_addr_not_changed; }
  436. Status GetOrigInputInfo(uint32_t index, OriginInputInfo &orig_input_info);
  437. Status GetAllAippInputOutputDims(uint32_t index, std::vector<InputOutputDims> &input_dims,
  438. std::vector<InputOutputDims> &output_dims);
  439. void SetModelDescVersion(bool is_new_model_desc) { is_new_model_desc_ = is_new_model_desc; }
  440. // om file name
  441. void SetOmName(string om_name) { om_name_ = om_name; }
  442. void SetDumpProperties(const DumpProperties &dump_properties) { data_dumper_.SetDumpProperties(dump_properties); }
  443. const DumpProperties &GetDumpProperties() const { return data_dumper_.GetDumpProperties(); }
  444. bool GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const {
  445. return data_dumper_.GetOpDescInfo(stream_id, task_id, op_desc_info);
  446. }
  447. Status InitInputOutputForDynamic(const ComputeGraphPtr &compute_graph);
  448. private:
  449. // memory address of weights
  450. uint8_t *weights_mem_base_;
  451. uint8_t *var_mem_base_;
  452. // memory address of model
  453. uintptr_t fixed_mem_base_; // Initial of mem_base_, keep forever.
  454. uint8_t *mem_base_;
  455. uint8_t *p2p_mem_base_;
  456. bool is_inner_mem_base_;
  457. bool is_inner_weight_base_;
  458. bool is_inner_p2p_mem_base_;
  459. // input data manager
  460. DataInputer *data_inputer_;
  461. int64_t load_begin_time_;
  462. int64_t load_end_time_;
  463. struct timeInfo time_info_;
  464. int32_t dataInputTid;
  465. ///
  466. /// @ingroup ge
  467. /// @brief Copy Check input size and model op size.
  468. /// @param [in] const int64_t &input_size: input size.
  469. /// @param [in] const int64_t &op_size: model op size.
  470. /// @param [in] is_dynamic: dynamic batch input flag.
  471. /// @return true if success
  472. ///
  473. bool CheckInputAndModelSize(const int64_t &input_size, const int64_t &op_size, bool is_dynamic);
  474. ///
  475. /// @ingroup ge
  476. /// @brief Set copy only for No task feed NetOutput address.
  477. /// @return None.
  478. ///
  479. void SetCopyOnlyOutput();
  480. ///
  481. /// @ingroup ge
  482. /// @brief Copy Input/Output to model for direct use.
  483. /// @param [in] const InputData &input_data: user input data info.
  484. /// @param [in/out] OutputData &output_data: user output data info.
  485. /// @param [in] bool is_dynamic: whether is dynamic input, true: is dynamic input; false: not is dynamic input
  486. /// @return SUCCESS handle successfully / others handle failed
  487. ///
  488. Status CopyModelData(const InputData &input_data, OutputData &output_data, bool is_dynamic);
  489. ///
  490. /// @ingroup ge
  491. /// @brief Copy Data addr to model for direct use.
  492. /// @param [in] data_info: model memory addr/size map { data_index, { tensor_size, tensor_addr } }.
  493. /// @param [in] is_input: input data or output data
  494. /// @param [in] blobs: user input/output data list.
  495. /// @param [in] is_dynamic: whether is dynamic input, true: is dynamic input; false: not is dynamic input
  496. /// @param [in] batch_label: batch label for multi-batch scenes
  497. /// @return SUCCESS handle successfully / others handle failed
  498. ///
  499. Status UpdateIoTaskArgs(const std::map<uint32_t, ZeroCopyOffset> &data_info, bool is_input,
  500. const vector<DataBuffer> &blobs, bool is_dynamic, const string &batch_label);
  501. Status CopyInputData(const InputData &input_data, bool device_data = false);
  502. Status CopyOutputData(uint32_t data_id, OutputData &output_data, rtMemcpyKind_t kind);
  503. Status SyncVarData();
  504. Status InitWeightMem(void *dev_ptr, void *weight_ptr, size_t weight_size);
  505. Status InitFeatureMapAndP2PMem(void *dev_ptr, size_t mem_size);
  506. void CreateInputDimsInfo(const OpDescPtr &op_desc, Format format, InputOutputDescInfo &input);
  507. void SetInputDimsInfo(const vector<int64_t> &model_input_dims, Format &format, InputOutputDescInfo &input);
  508. Status GetInputDescInfo(vector<InputOutputDescInfo> &input_desc, std::vector<uint32_t> &formats);
  509. Status InitTaskInfo(domi::ModelTaskDef &modelTaskInfo);
  510. void UnbindHcomStream();
  511. Status DistributeTask();
  512. uint8_t *MallocFeatureMapMem(size_t data_size);
  513. uint8_t *MallocWeightsMem(size_t weights_size);
  514. uint8_t* MallocP2PMem(size_t p2p_data_size);
  515. void FreeFeatureMapMem();
  516. void FreeWeightsMem();
  517. void FreeP2PMem();
  518. void ReleaseTask();
  519. void UnbindTaskSinkStream();
  520. bool IsAicpuKernelConnectSpecifiedLayer();
  521. ///
  522. /// @ingroup ge
  523. /// @brief Reduce memory usage after task sink.
  524. /// @return: void
  525. ///
  526. void Shrink();
  527. ///
  528. /// @ingroup ge
  529. /// @brief Travel all nodes and do some init.
  530. /// @param [in] compute_graph: ComputeGraph to load.
  531. /// @return Status
  532. ///
  533. Status InitNodes(const ComputeGraphPtr &compute_graph);
  534. ///
  535. /// @ingroup ge
  536. /// @brief Data Op Initialize.
  537. /// @param [in] NodePtr: Data Op.
  538. /// @param [in/out] data_op_index: NetOutput addr size info.
  539. /// @return Status
  540. ///
  541. Status InitDataOp(const NodePtr &node, uint32_t &data_op_index, map<uint32_t, OpDescPtr> &data_by_index);
  542. ///
  543. /// @ingroup ge
  544. /// @brief Sort Data op list by index.
  545. /// @param [in] data_by_index: map of Data Op.
  546. /// @return
  547. ///
  548. void AdjustDataOpList(const map<uint32_t, OpDescPtr> &data_by_index);
  549. ///
  550. /// @ingroup ge
  551. /// @brief NetOutput Op Initialize.
  552. /// @param [in] NodePtr: NetOutput Op.
  553. /// @return Status
  554. ///
  555. Status InitNetOutput(const NodePtr &node);
  556. ///
  557. /// @ingroup ge
  558. /// @brief Constant Op Init.
  559. /// @return Status
  560. ///
  561. Status InitConstant(const OpDescPtr &op_desc);
  562. Status InitVariable(const OpDescPtr &op_desc);
  563. /// @ingroup ge
  564. /// @brief LabelSet Op Initialize.
  565. /// @param [in] op_desc: LabelSet Op descriptor.
  566. /// @return Status
  567. Status InitLabelSet(const OpDescPtr &op_desc);
  568. Status InitStreamSwitch(const OpDescPtr &op_desc);
  569. Status InitStreamActive(const OpDescPtr &op_desc);
  570. Status InitStreamSwitchN(const OpDescPtr &op_desc);
  571. ///
  572. /// @ingroup ge
  573. /// @brief Case Op Init.
  574. /// @return Status
  575. ///
  576. Status InitCase(const OpDescPtr &op_desc);
  577. Status SetDynamicBatchInfo(const OpDescPtr &op_desc, uint32_t batch_num);
  578. ///
  579. /// @ingroup ge
  580. /// @brief TVM Op Init.
  581. /// @return Status
  582. ///
  583. Status InitTbeHandle(const OpDescPtr &op_desc);
  584. void StoreTbeHandle(const std::string &handle_key);
  585. void CleanTbeHandle();
  586. ///
  587. /// @ingroup ge
  588. /// @brief Make active stream list and bind to model.
  589. /// @return: 0 for success / others for fail
  590. ///
  591. Status BindModelStream();
  592. ///
  593. /// @ingroup ge
  594. /// @brief Init model stream for NN model.
  595. /// @return Status
  596. ///
  597. Status InitModelStream(rtStream_t stream);
  598. ///
  599. /// @ingroup ge
  600. /// @brief ACL, Load task list with queue entrance.
  601. /// @return: 0 for success / others for fail
  602. ///
  603. Status LoadWithQueue();
  604. ///
  605. /// @ingroup ge
  606. /// @brief ACL, Bind Data Op addr to input queue.
  607. /// @return: 0 for success / others for fail
  608. ///
  609. Status BindInputQueue();
  610. Status CpuTaskModelZeroCopy(std::vector<uintptr_t> &mbuf_list, std::map<const void *, ZeroCopyOffset> &outside_addrs);
  611. ///
  612. /// @ingroup ge
613. /// @brief ACL, Bind NetOutput Op addr to output queue.
614. /// @return: 0 for success / others for fail
615. ///
616. Status BindOutputQueue();
/// Build the CPU task that publishes one NetOutput tensor (addr, size) for output-queue schedule.
617. Status CpuModelPrepareOutput(uintptr_t addr, uint32_t size);
618. ///
619. /// @ingroup ge
620. /// @brief definiteness queue schedule, bind input queue to task.
621. /// @param [in] queue_id: input queue id from user.
624. /// @return: 0 for success / others for fail
625. ///
626. Status CpuModelDequeue(uint32_t queue_id);
627. ///
628. /// @ingroup ge
629. /// @brief definiteness queue schedule, bind output queue to task.
630. /// @param [in] queue_id: output queue id from user.
631. /// @param [in] addr: NetOutput Op input tensor address.
632. /// @param [in] size: NetOutput Op input tensor size.
633. /// @return: 0 for success / others for fail
634. ///
635. Status CpuModelEnqueue(uint32_t queue_id, uintptr_t addr, uint32_t size);
636. ///
637. /// @ingroup ge
638. /// @brief definiteness queue schedule, active original model stream.
639. /// @return: 0 for success / others for fail
640. ///
641. Status CpuActiveStream();
642. ///
643. /// @ingroup ge
644. /// @brief definiteness queue schedule, wait for end graph.
645. /// @return: 0 for success / others for fail
646. ///
647. Status CpuWaitEndGraph();
/// Bind prepared output mbufs to their output queues (ACL queue schedule) — presumably pairs with
/// output_mbuf_list_ / output_queue_ids_; confirm against the .cc implementation.
648. Status BindEnqueue();
/// Enqueue a single output mbuf to the given queue (overload used by BindEnqueue).
649. Status CpuModelEnqueue(uint32_t queue_id, uintptr_t out_mbuf);
650. ///
651. /// @ingroup ge
652. /// @brief definiteness queue schedule, repeat run model.
653. /// @return: 0 for success / others for fail
654. ///
655. Status CpuModelRepeat();
/// Set up the entry task list used to launch the model run. TODO(review): confirm exact contract in .cc.
656. Status InitEntryTask();
/// Add an extra head stream in front of the model streams (see is_pure_head_stream_ / rt_head_stream_).
657. Status AddHeadStream();
658. ///
659. /// @ingroup ge
660. /// @brief set ts device.
661. /// @return: 0 for success / others for fail
662. ///
663. Status SetTSDevice();
/// Register / unregister op-debug memory (see op_debug_addr_, p2p_debug_addr_, is_op_debug_reg_).
664. Status OpDebugRegister();
665. void OpDebugUnRegister();
/// Scan the graph for HCCL (hcom) ops; results land in hcom_streams_.
666. void CheckHasHcomOp();
667. Status DoTaskSink();
668. void CreateOutput(uint32_t index, OpDescPtr &op_desc, InputOutputDescInfo &output, uint32_t &format_result);
669. Status TransAllVarData(ComputeGraphPtr &graph, uint32_t graph_id);
670. // get desc info of graph for profiling
671. Status GetComputeGraphInfo(vector<ComputeGraphDescInfo> &graph_desc_info);
672. void SetDataDumperArgs(const ComputeGraphPtr &compute_graph);
// Profiling: init profile metadata, then sink model/time profile records.
673. Status InitModelProfile();
674. Status SinkModelProfile();
675. Status SinkTimeProfile(const InputData &current_data);
676. Status GenOutputTensorInfo(const OpDescPtr &op_desc, uint32_t data_index, OutputData *output_data,
677. std::vector<ge::OutputTensorInfo> &outputs);
678. void ParseAIPPInfo(std::string in_out_info, InputOutputDims &dims_info);
679. void SetLabelForDynamic(const NodePtr &node);
680. void ParseDynamicOutShape(const std::vector<std::string> &str_info, std::vector<vector<int64_t>> &vec_info);
681. bool IsGetNextSinkDynamic(const OpDescPtr &op_desc);
// Helpers for online dynamic-dims (getnext-sink) models: collect gear info and per-gear
// real output size/shape (see merge_nodes_gear_and_real_out_*_info_ members below).
682. void GetAllGearsInfo(const NodePtr &node);
683. Status GetGetDynamicDimsNodeInfo(const NodePtr &node);
684. Status GetGearAndRealOutSizeInfo(size_t input_count, const NodePtr &node);
685. Status GetRealOutputSizeOfMerge(size_t input_index, const NodePtr &merge_node);
686. Status GetGearAndRealOutShapeInfo(size_t input_count, const OpDescPtr &op_desc);
687. bool is_weight_mem_has_inited_;
688. bool is_feature_map_mem_has_inited_;
689. uint32_t model_id_;
690. uint32_t runtime_model_id_;
691. string name_;
692. // used for inference data dump
693. string om_name_;
694. uint32_t version_;
695. GeModelPtr ge_model_;
696. bool need_destroy_aicpu_kernel_{false};
697. vector<std::string> out_node_name_;
698. map<uint32_t, OpDescPtr> op_list_;
699. // data op_desc
700. vector<OpDescPtr> data_op_list_;
701. vector<OpDescPtr> output_op_list_;
702. vector<OpDescPtr> variable_op_list_;
// Zero-copy bookkeeping: per-index offsets for model inputs/outputs, plus the user-provided
// outside addresses that map onto them.
703. std::map<uint32_t, ZeroCopyOffset> new_input_data_info_;
704. std::map<uint32_t, ZeroCopyOffset> new_output_data_info_;
705. std::map<const void *, ZeroCopyOffset> new_input_outside_addrs_;
706. std::map<const void *, ZeroCopyOffset> new_output_outside_addrs_;
707. std::set<const void *> real_virtual_addrs_;
708. // output op: save cce op actual needed memory size
709. vector<int64_t> output_memory_size_list_;
710. std::thread thread_id_;
711. std::shared_ptr<ModelListener> listener_;
712. bool run_flg_;  // NOTE(review): "flg" is a typo for "flag"; renaming would touch all users of this header.
713. std::mutex mux_run_flg_;
714. int32_t priority_;
715. vector<rtStream_t> stream_list_;
716. std::mutex all_hccl_stream_list_mutex_;  // guards all_hccl_stream_list_
717. vector<rtStream_t> all_hccl_stream_list_;
718. // for reuse hccl_follow_stream
719. std::mutex capacity_of_stream_mutex_;  // guards main_follow_stream_mapping_
720. std::map<int64_t, std::vector<rtStream_t>> main_follow_stream_mapping_;
721. vector<rtEvent_t> event_list_;
722. vector<rtLabel_t> label_list_;
723. set<uint32_t> label_id_indication_;
724. std::mutex outside_addrs_mutex_;
725. std::vector<ZeroCopyTask> zero_copy_tasks_; // Task used Data or NetOutput addr.
726. std::set<const void *> copy_only_addrs_; // Address need copy to original place.
727. std::vector<TaskInfoPtr> task_list_;
728. // runtime model handle (rt_model_handle_)
729. rtModel_t rt_model_handle_;
730. rtStream_t rt_model_stream_;
731. bool is_inner_model_stream_;
732. bool is_async_mode_; // For NN execute, Async mode use rtMemcpyAsync on rt_model_stream_.
733. ExecuteMode last_execute_mode_;
734. bool is_stream_list_bind_{false};
735. bool is_pure_head_stream_{false};
736. rtStream_t rt_head_stream_{nullptr};
737. rtStream_t rt_entry_stream_{nullptr};
738. rtAicpuDeployType_t deploy_type_{AICPU_DEPLOY_RESERVED};
739. // ACL queue schedule, save queue ids for Init.
740. std::vector<TaskInfoPtr> cpu_task_list_;
741. std::vector<uint32_t> input_queue_ids_; // input queue ids created by caller.
742. std::vector<uint32_t> output_queue_ids_; // output queue ids created by caller.
743. std::vector<uintptr_t> input_mbuf_list_; // input mbuf created by dequeue task.
744. std::vector<uintptr_t> output_mbuf_list_; // output mbuf created by dequeue task.
745. uint64_t session_id_;
746. uint32_t device_id_;
747. std::mutex flowctrl_op_index_internal_map_mutex_;  // guards flowctrl_op_index_internal_map_
748. std::map<uint32_t, uint32_t> flowctrl_op_index_internal_map_;
749. std::vector<rtStream_t> active_stream_list_;
750. std::set<uint32_t> active_stream_indication_;
751. std::set<uint32_t> hcom_streams_;
752. RuntimeParam runtime_param_;
753. static std::mutex tvm_bin_mutex_;  // static: shared across all DavinciModel instances
754. std::set<std::string> tvm_bin_kernel_;
755. std::map<std::string, uint32_t> used_tbe_handle_map_;
756. // for profiling task and graph info
757. std::vector<TaskDescInfo> task_desc_info_;
758. int64_t maxDumpOpNum_;  // NOTE(review): camelCase breaks the trailing-underscore snake_case member convention.
759. // for data dump
760. DataDumper data_dumper_;
761. uint64_t iterator_count_;
762. bool is_l1_fusion_enable_;
763. std::map<OpDescPtr, void *> saved_task_addrs_;
764. void *l1_fusion_addr_ = nullptr;
// State for "known node" (known-shape) execution — args buffers and sizes. TODO(review): confirm
// exact ownership/lifetime of args_/args_host_ in the .cc implementation.
765. bool known_node_ = false;
766. uint32_t total_args_size_ = 0;
767. void *args_ = nullptr;
768. void *args_host_ = nullptr;
769. void *fixed_addrs_ = nullptr;
770. void *hybrid_addrs_ = nullptr;
771. uint32_t total_hybrid_args_size_ = 0;
772. int64_t total_fixed_addr_size_ = 0;
// NOTE(review): "knonw" is a typo for "known" in the two member names below; renaming requires
// updating every reference in the corresponding .cc, so left as-is here.
773. std::map<const void *, void *> knonw_input_data_info_;
774. std::map<const void *, void *> knonw_output_data_info_;
775. vector<void *> total_io_addrs_;
776. vector<void *> orig_total_io_addrs_;
777. bool base_addr_not_changed_ = false;
// Dynamic batch / dynamic dims metadata.
778. vector<vector<int64_t>> batch_info_;
779. std::vector<std::vector<int64_t>> combined_batch_info_;
780. vector<string> user_designate_shape_order_;
781. int32_t dynamic_type_ = 0;
782. bool is_dynamic_ = false;
783. vector<uint64_t> batch_size_;
784. // key: input tensor name, generally rts op;
785. // value: the fixed addr of input anchor, same as the peer output anchor addr of the peer op
786. std::map<string, int64_t> tensor_name_to_fixed_addr_size_;
787. // key: input tensor name, generally rts op; value: the peer output anchor of the peer op
788. std::map<string, int64_t> tensor_name_to_peer_output_index_;
789. // if model is first execute
790. bool is_first_execute_;
791. // for op debug
792. std::mutex debug_reg_mutex_;
793. bool is_op_debug_reg_ = false;
794. void *op_debug_addr_ = nullptr;
795. void *p2p_debug_addr_ = nullptr;
796. bool is_new_model_desc_{false};
797. bool is_online_infer_dynamic_ = false;
798. bool is_getnext_sink_dynamic_ = false;
799. std::vector<int64_t> cur_dynamic_dims_;
800. void *netoutput_last_input_addr_ = nullptr;
801. int64_t netoutput_last_input_size_ = 0;
802. size_t shape_of_cur_dynamic_dims_ = 0;
803. // key: input_index: input is merge node; value: each gear info and each output size
804. std::map<size_t, std::map<vector<int64_t>, int64_t>> merge_nodes_gear_and_real_out_size_info_;
805. // key: input_index: input is merge node; value: each gear info and each output shape
806. std::map<size_t, std::map<vector<int64_t>, vector<int64_t>>> merge_nodes_gear_and_real_out_shape_info_;
807. std::vector<std::vector<int64_t>> all_gears_info_;
808. std::multimap<uint32_t, uint32_t> op_id_map_;
809. std::vector<ProfileInfo> profile_list_;
810. // For super kernel.
811. SuperKernelTaskInfo skt_info_;
  812. };
  813. } // namespace ge
  814. #endif // GE_GRAPH_LOAD_NEW_MODEL_MANAGER_DAVINCI_MODEL_H_

图引擎模块(GE)是MindSpore的一个子模块,其代码由C++实现,位于前端模块ME和底层硬件之间,起到承接作用。图引擎模块以ME下发的图作为输入,然后进行一系列的深度图优化操作,最后输出一张可以在底层硬件上高效运行的图。GE针对昇腾AI处理器的硬件结构特点,做了特定的优化工作,以此来充分发挥出昇腾AI处理器的强大算力。在进行模型训练/推理时,GE会被自动调用而用户并不感知。GE主要由GE API和GE Core两部分组成,详细的架构图如下所示