You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

rt_ffts.h 7.3 kB

4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198
  1. /**
  2. * Copyright 2019-2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef CCE_RUNTIME_RT_FFTS_H
  17. #define CCE_RUNTIME_RT_FFTS_H
  18. #include "base.h"
  19. #if defined(__cplusplus)
  20. extern "C" {
  21. #endif
  22. #define RT_FFTS_MAX_SUB_TASK_NUM 32U // capacity of rtFftsTaskInfo_t.subTask[]
  23. #define RT_FFTS_MAX_TICKET_CACHE_NUM 64U // capacity of rtFftsTaskInfo_t.ticketCache[]
  24. #define RT_FFTS_MAX_MANUAL_THREAD_NUM 16U // length of the per-thread arrays in the manual-thread structs
  25. #define RT_FFTS_MAX_TICKET_CACHE_PER_SUBTASK 8U // length of the per-sub-task ticket cache ID, prefetch and dependency arrays
  26. #define RT_FFTS_MANUAL_SRC_DEPEND_TBL_LEN 32U // length of rtManualThreadDependency_t.dependency[]
  27. typedef enum tagFftsType { // FFTS threading mode; stored in rtFftsTaskInfo_t.fftsType
  28. RT_FFTS_TYPE_AUTO_THREAD = 2, // ffts auto thread mode, same value as the ffts definition
  29. RT_FFTS_TYPE_MANUAL_THREAD = 3, // ffts manual thread mode, same value as the ffts definition
  30. } rtFftsType_t;
  31. typedef enum tagFftsSubTaskType { // kind of one FFTS sub-task; stored in rtFftsSubTaskInfo_t.subTaskType
  32. RT_FFTS_SUB_TASK_TYPE_AIC = 0, // NOTE(review): AIC/AIV presumably name two kinds of compute core - confirm
  33. RT_FFTS_SUB_TASK_TYPE_AIV = 1,
  34. RT_FFTS_SUB_TASK_TYPE_NOP = 2,
  35. RT_FFTS_SUB_TASK_TYPE_NOTIFY_WAIT = 3,
  36. RT_FFTS_SUB_TASK_TYPE_NOTIFY_RECORD = 4,
  37. RT_FFTS_SUB_TASK_TYPE_WRITE_VALUE = 5,
  38. RT_FFTS_SUB_TASK_TYPE_MIX_AIC = 6,
  39. RT_FFTS_SUB_TASK_TYPE_MIX_AIV = 7,
  40. RT_FFTS_SUB_TASK_TYPE_SDMA = 8,
  41. RT_FFTS_SUB_TASK_TYPE_RESERVED = 9, // highest value declared in this enum
  42. } rtFftsSubTaskType_t;
  43. typedef struct tagManualThreadDmuInfo { // one DMU descriptor; arrays of these are referenced by prefetchList and dmuList
  44. uint64_t dataAddr; // device mem
  45. uint16_t numOuter; // NOTE(review): outer/inner presumably describe a two-level strided region - confirm
  46. uint16_t numInner;
  47. uint32_t strideOuter; // NOTE(review): units of the stride/length fields are not stated here - confirm
  48. uint32_t lenInner;
  49. uint32_t strideInner;
  50. } rtManualThreadDmuInfo_t;
  51. typedef struct tagManualThreadDependency { // one row of a srcDepTbl[] source-dependency table
  52. uint8_t dependency[RT_FFTS_MANUAL_SRC_DEPEND_TBL_LEN]; // NOTE(review): meaning of each byte is not documented in this header
  53. } rtManualThreadDependency_t;
  54. typedef struct tagManualThreadAicAivInfo { // manual-thread parameters; member manualThreadAicAiv of rtFftsSubTaskInfo_t.custom
  55. uint64_t taskParamAddr; // device mem
  56. uint16_t taskParamOffset;
  57. // when satMode=1 and an FP16 computation with non-INF inputs overflows/underflows, results will be +/-INF of FP16
  58. // when satMode=0 and an FP16 computation with non-INF inputs overflows/underflows,
  59. // results will be saturated to +/-MAX of FP16
  60. uint8_t satMode;
  61. uint8_t scheduleMode; // 0: normal mode, 1: batch mode, 2: sync mode, 3: reserved
  62. uint8_t iCachePrefetchCnt; // unit is 2K
  63. uint8_t prefetchEnableBitmap; // 8 bit bitmap 1 0 1 0
  64. uint8_t prefetchOnceBitmap; // 8 bit bitmap 1 0 1 0
  65. uint16_t prefetchOnceDmuNum; // prefetch_once_dmu_descriptor_index in ffts
  66. // num: thread0_prefetch_dmu_descriptor_index - prefetch_once_dmu_descriptor_index
  67. uint16_t threadPrefetchDmuIdx[RT_FFTS_MAX_MANUAL_THREAD_NUM]; // max valid is threadDim
  68. uint16_t threadBlkDim[RT_FFTS_MAX_MANUAL_THREAD_NUM]; // one entry per manual thread
  69. const char_t *threadTaskFuncStub[RT_FFTS_MAX_MANUAL_THREAD_NUM]; // one stub string per manual thread
  70. rtManualThreadDmuInfo_t *prefetchList; // dmu desc 0-64k; length is the last valid index, threadPrefetchDmuIdx[threadDim-1]
  71. rtManualThreadDependency_t srcDepTbl[RT_FFTS_MAX_TICKET_CACHE_PER_SUBTASK]; // NOTE(review): presumably one row per source ticket cache - confirm
  72. } rtManualThreadAicAivInfo_t;
  73. typedef struct tagAutoThreadPrefetch { // one prefetch entry; element type of rtAutoThreadAicAivInfo_t.srcPrefetch[]
  74. uint64_t dataAddr; // device mem
  75. uint32_t dataAddrOffset;
  76. uint32_t nonTailDataLen; // NOTE(review): presumably the length used for non-tail threads - confirm
  77. uint32_t tailDataLen; // NOTE(review): presumably the length used for the tail thread - confirm
  78. } rtAutoThreadPrefetch_t;
  79. typedef struct tagAutoThreadAicAivInfo { // auto-thread parameters; member autoThreadAicAiv of rtFftsSubTaskInfo_t.custom
  80. uint64_t taskParamAddr; // device mem
  81. uint16_t taskParamOffset;
  82. // when satMode=1 and an FP16 computation with non-INF inputs overflows/underflows, results will be +/-INF of FP16
  83. // when satMode=0 and an FP16 computation with non-INF inputs overflows/underflows, results will be saturated to +/-MAX of FP16
  84. uint8_t satMode;
  85. uint8_t scheduleMode; // 0: normal mode, 1: batch mode, 2: sync mode, 3: reserved
  86. uint8_t iCachePrefetchCnt; // unit is 2K
  87. uint8_t prefetchEnableBitmap; // 8 bit bitmap
  88. uint8_t prefetchOnceBitmap; // 8 bit bitmap
  89. uint16_t tailBlkDim;
  90. uint16_t nonTailBlkDim;
  91. const char_t *nonTailTaskFuncStub;
  92. const char_t *tailTaskFuncStub;
  93. // for prefetch, the number of valid entries is the bit count of prefetchEnableBitmap.
  94. // e.g. if prefetchEnableBitmap='00010011', 3 prefetches are needed and only srcPrefetch[0], [1] and [2] are valid
  95. rtAutoThreadPrefetch_t srcPrefetch[RT_FFTS_MAX_TICKET_CACHE_PER_SUBTASK];
  96. } rtAutoThreadAicAivInfo_t;
  97. typedef struct tagAutoThreadCacheInfo { // auto-thread ticket cache data; member autoThreadCache of rtTicketCache_t.custom
  98. uint64_t dataAddr; // device mem
  99. uint32_t dataAddrOffset;
  100. uint32_t nonTailDataLen;
  101. uint32_t tailDataLen;
  102. uint16_t ticketCacheRefCnt; // NOTE(review): presumably a reference count for this ticket cache - confirm
  103. } rtAutoThreadCacheInfo_t;
  104. typedef struct tagManualThreadCacheInfo { // manual-thread ticket cache data; member manualThreadCache of rtTicketCache_t.custom
  105. rtManualThreadDmuInfo_t *dmuList; // 0-64k
  106. uint16_t dmuNum; // NOTE(review): presumably the number of entries in dmuList - confirm
  107. uint16_t sliceDmuIdx[RT_FFTS_MAX_MANUAL_THREAD_NUM]; // one entry per manual thread
  108. uint16_t ticketCacheRefCntTbl[RT_FFTS_MAX_MANUAL_THREAD_NUM]; // one entry per manual thread
  109. } rtManualThreadCacheInfo_t;
  110. typedef enum tagCacheOp { // cache operation selector; stored in rtTicketCache_t.cacheOption
  111. RT_CACHE_OP_NONE = 0,
  112. RT_CACHE_OP_FLUSH = 1,
  113. RT_CACHE_OP_INVALIDATE = 2,
  114. RT_CACHE_OP_WRITE_BACK = 3,
  115. } rtCacheOp_t;
  116. typedef struct tagTicketCache { // one ticket cache entry; element type of rtFftsTaskInfo_t.ticketCache[]
  117. rtCacheOp_t cacheOption;
  118. uint8_t ticketCacheWindow;
  119. union { // NOTE(review): the active member is presumably chosen by rtFftsTaskInfo_t.fftsType - confirm
  120. rtAutoThreadCacheInfo_t autoThreadCache;
  121. rtManualThreadCacheInfo_t manualThreadCache;
  122. } custom;
  123. } rtTicketCache_t;
  124. typedef struct tagManualThreadNopInfo { // manual-thread NOP parameters; member manualThreadNop of rtFftsSubTaskInfo_t.custom
  125. // depends on srcTickCacheVldBitmap in rtFftsSubTaskInfo_t
  126. rtManualThreadDependency_t srcDepTbl[RT_FFTS_MAX_TICKET_CACHE_PER_SUBTASK];
  127. } rtManualThreadNopInfo_t;
  128. typedef struct tagFftsSubTaskInfo { // description of one sub-task; element type of rtFftsTaskInfo_t.subTask[]
  129. rtFftsSubTaskType_t subTaskType;
  130. uint16_t threadDim; // bounds the valid entries of the per-thread arrays in the manual-thread info
  131. uint8_t dstTickCacheVldBitmap; // NOTE(review): presumably marks which dstTickCacheID[] entries are valid - confirm
  132. uint8_t srcTickCacheVldBitmap; // NOTE(review): presumably marks which srcTickCacheID[] entries are valid - confirm
  133. uint8_t srcDataOutOfSubGraphBitmap;
  134. uint8_t dstTickCacheID[RT_FFTS_MAX_TICKET_CACHE_PER_SUBTASK];
  135. uint8_t srcTickCacheID[RT_FFTS_MAX_TICKET_CACHE_PER_SUBTASK];
  136. union { // NOTE(review): the active member presumably follows subTaskType and the task's fftsType - confirm
  137. rtAutoThreadAicAivInfo_t autoThreadAicAiv;
  138. rtManualThreadAicAivInfo_t manualThreadAicAiv;
  139. rtManualThreadNopInfo_t manualThreadNop;
  140. } custom;
  141. } rtFftsSubTaskInfo_t;
  142. typedef struct tagFftsDescInfo { // task-wide FFTS settings; stored in rtFftsTaskInfo_t.fftsDesc
  143. uint8_t tm; // thread subtask kickstart mode, 0: order, 1: disorder
  144. uint8_t di; // discard invalidate
  145. uint8_t dw; // discard write back
  146. uint8_t df; // discard flush
  147. uint8_t dataSplitUnit; // split source or ticket cache by 2^dataSplitUnit MB
  148. uint8_t prefetchOstNum; // NOTE(review): "Ost" presumably abbreviates "outstanding" - confirm
  149. uint8_t cacheMaintainOstNum;
  150. uint8_t aicPrefetchUpper;
  151. uint8_t aicPrefetchLower;
  152. uint8_t aivPrefetchUpper;
  153. uint8_t aivPrefetchLower;
  154. } rtFftsDescInfo_t;
  155. typedef struct tagFftsTaskInfo { // complete FFTS task description; argument type of rtFftsTaskLaunch and rtFftsTaskLaunchWithFlag
  156. rtFftsType_t fftsType;
  157. uint16_t subTaskNum; // number of entries used in subTask[]
  158. uint16_t tickCacheNum; // number of entries used in ticketCache[]
  159. rtFftsDescInfo_t fftsDesc;
  160. // sub-task descriptors; the real number is subTaskNum
  161. rtFftsSubTaskInfo_t subTask[RT_FFTS_MAX_SUB_TASK_NUM];
  162. // ticket caches; the real number is tickCacheNum
  163. rtTicketCache_t ticketCache[RT_FFTS_MAX_TICKET_CACHE_NUM];
  164. } rtFftsTaskInfo_t;
  165. RTS_API rtError_t rtFftsTaskLaunch(rtFftsTaskInfo_t *fftsTaskInfo, rtStream_t stream); // takes a task description and a stream; NOTE(review): presumably enqueues the task on stream - confirm
  166. RTS_API rtError_t rtGetC2cCtrlAddr(uint64_t *addr, uint32_t *len); // addr and len are non-const pointers; NOTE(review): presumably output parameters - confirm
  167. RTS_API rtError_t rtFftsTaskLaunchWithFlag(rtFftsTaskInfo_t *fftsTaskInfo, rtStream_t stream, uint32_t flag); // same parameters as rtFftsTaskLaunch plus flag; NOTE(review): flag values are not defined in the visible header
  168. #if defined(__cplusplus)
  169. }
  170. #endif
  171. #endif // CCE_RUNTIME_RT_FFTS_H

图引擎模块(GE)是MindSpore的一个子模块，其代码由C++实现，位于前端模块ME和底层硬件之间，起到承接作用。图引擎模块以ME下发的图作为输入，然后进行一系列的深度图优化操作，最后输出一张可以在底层硬件上高效运行的图。GE针对昇腾AI处理器的硬件结构特点，做了特定的优化工作，以此来充分发挥出昇腾AI处理器的强大算力。在进行模型训练/推理时，GE会被自动调用，用户对此并无感知。GE主要由GE API和GE Core两部分组成，详细的架构图如下所示。