You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

dgemm_kernel_8x4.S 32 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713
  1. /*******************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. /* X0 X1 X2 d0 X3 x4 x5 x6 */
  30. /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/
  // Incoming arguments, in the order listed by the CNAME signature comment
  // above: bm, bn, bk in x0-x2, then the ba, bb, C pointers and ldc in x3-x6.
  31. #define origM x0
  32. #define origN x1
  33. #define origK x2
  34. #define origPA x3
  35. #define origPB x4
  36. #define pC x5
  37. #define LDC x6
  // Scratch register and loop counters.
  38. #define temp x7
  39. #define counterL x8
  40. #define counterI x9
  41. #define counterJ x10
  // Working pointers: pB and pA walk the B and A buffers; pCRow0..pCRow3 are
  // the four C output pointers used by the SAVE macros.
  42. #define pB x11
  43. #define pCRow0 x12
  44. #define pCRow1 x13
  45. #define pCRow2 x14
  46. #define pCRow3 x15
  47. #define pA x16
  // alpha is parked in a general register; each SAVE macro copies it into d10
  // (alpha0) and uses lane 0 of v10 (alphaV0) as the scalar multiplier.
  48. #define alpha x17
  49. #define alpha0 d10
  50. #define alphaV0 v10.d[0]
  // Byte offsets used as prefetch distances in the prfm instructions.
  51. #define A_PRE_SIZE 2560
  52. #define B_PRE_SIZE 448
  53. #define C_PRE_SIZE 128
  54. // 00 origM
  55. // 01 origN
  56. // 02 origK
  57. // 03 origPA
  58. // 04 origPB
  59. // 05 pC
  60. // 06 origLDC -> LDC
  61. // 07 temp
  62. // 08 counterL
  63. // 09 counterI
  64. // 10 counterJ
  65. // 11 pB
  66. // 12 pCRow0
  67. // 13 pCRow1
  68. // 14 pCRow2
  69. // 15 pCRow3
  70. // 16 pA
  71. // 17 alpha
  72. // 18 must save
  73. // 19 must save
  74. // 20 must save
  75. // 21 must save
  76. // 22 must save
  77. // 23 must save
  78. // 24 must save
  79. // 25 must save
  80. // 26 must save
  81. // 27 must save
  82. // 28 must save
  83. // 29 frame
  84. // 30 link
  85. // 31 sp
  86. //v00 ALPHA -> pA0_0, pA0_1
  87. //v01 pA0_2, pA0_3
  88. //v02 pA0_4, pA0_5
  89. //v03 pA0_6, pA0_7
  90. //v04 pA1_0, pA1_1
  91. //v05 pA1_2, pA1_3
  92. //v06 pA1_4, pA1_5
  93. //v07 pA1_6, pA1_7
  94. //v08 must save pB0_0
  95. //v09 must save pB0_1
  96. //v10 must save pB0_2 --> ALPHA0
  97. //v11 must save pB0_3
  98. //v12 must save pB1_0
  99. //v13 must save pB1_1
  100. //v14 must save pB1_2
  101. //v15 must save pB1_3
  102. //v16 must save C00, C01
  103. //v17 must save C02, C03
  104. //v18 C04, C05
  105. //v19 C06, C07
  106. //v20 C10, C11
  107. //v21 C12, C13
  108. //v22 C14, C15
  109. //v23 C16, C17
  110. //v24 C20, C21
  111. //v25 C22, C23
  112. //v26 C24, C25
  113. //v27 C26, C27
  114. //v28 C30, C31
  115. //v29 C32, C33
  116. //v30 C34, C35
  117. //v31 C36, C37
  118. /*******************************************************************************
  119. * Macro definitions
  120. *******************************************************************************/
  // INIT8x4: set the sixteen 8x4 accumulators v16-v31 to zero.
  // Each register is written through its scalar d-view, either from xzr or by
  // copying a register that was already zeroed above; on AArch64 a scalar
  // write also clears the upper 64 bits of the vector register.
  121. .macro INIT8x4
  122. fmov d16, xzr
  123. fmov d17, xzr
  124. fmov d18, d16
  125. fmov d19, xzr
  126. fmov d20, xzr
  127. fmov d21, d16
  128. fmov d22, d17
  129. fmov d23, d18
  130. fmov d24, xzr
  131. fmov d25, d16
  132. fmov d26, d17
  133. fmov d27, d18
  134. fmov d28, xzr
  135. fmov d29, d16
  136. fmov d30, d17
  137. fmov d31, d18
  138. .endm
  // KERNEL8x4_I: opening step of the unrolled 8x4 loop.
  // Loads eight A doubles into v0-v3 and four B doubles into d8-d11, then
  // writes all sixteen accumulators v16-v31 with fmul (so no prior zeroing is
  // required). In between it also loads the operands consumed by the
  // following KERNEL8x4_M2 / KERNEL8x4_E step into v4-v7 and d12-d15.
  // Net pointer movement: pA += 128 bytes, pB += 64 bytes.
  // Accumulator layout: v16-v19 = A * d8, v20-v23 = A * d9,
  // v24-v27 = A * d10, v28-v31 = A * d11.
  139. .macro KERNEL8x4_I
  140. ldp q0, q1, [pA], #32
  141. ldp d8, d9, [pB], #16
  142. fmul v16.2d, v0.2d, v8.d[0]
  143. fmul v20.2d, v0.2d, v9.d[0]
  144. ldp d10, d11, [pB], #16
  145. fmul v17.2d, v1.2d, v8.d[0]
  146. fmul v21.2d, v1.2d, v9.d[0]
  147. ldp q2, q3, [pA], #32
  148. fmul v24.2d, v0.2d, v10.d[0]
  149. fmul v28.2d, v0.2d, v11.d[0]
  // From here on, loads into v4-v7 / d12-d15 fetch data for the next step.
  150. ldp q4, q5, [pA], #32
  151. fmul v25.2d, v1.2d, v10.d[0]
  152. fmul v29.2d, v1.2d, v11.d[0]
  153. ldp d12, d13, [pB], #16
  154. fmul v18.2d, v2.2d, v8.d[0]
  155. fmul v22.2d, v2.2d, v9.d[0]
  156. ldp d14, d15, [pB], #16
  157. fmul v26.2d, v2.2d, v10.d[0]
  158. fmul v30.2d, v2.2d, v11.d[0]
  159. ldp q6, q7, [pA], #32
  160. fmul v19.2d, v3.2d, v8.d[0]
  161. fmul v27.2d, v3.2d, v10.d[0]
  162. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  163. fmul v31.2d, v3.2d, v11.d[0]
  164. fmul v23.2d, v3.2d, v9.d[0]
  165. prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
  166. .endm
  // KERNEL8x4_M1: middle step of the unrolled 8x4 loop.
  // Accumulates v0-v3 (A) times d8-d11 (B) into v16-v31 while loading the
  // operands for the following M2/E step into v4-v7 and d12-d15.
  // Expects v0-v3 and d8-d11 to have been loaded by the preceding step.
  // Pointer movement: pA += 64 bytes, pB += 32 bytes. Issues two A prefetches.
  167. .macro KERNEL8x4_M1
  168. fmla v16.2d, v0.2d, v8.d[0]
  169. fmla v20.2d, v0.2d, v9.d[0]
  170. ldp q4, q5, [pA], #32
  171. fmla v24.2d, v0.2d, v10.d[0]
  172. fmla v28.2d, v0.2d, v11.d[0]
  173. ldp d12, d13, [pB], #16
  174. fmla v17.2d, v1.2d, v8.d[0]
  175. fmla v25.2d, v1.2d, v10.d[0]
  176. prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
  177. fmla v21.2d, v1.2d, v9.d[0]
  178. fmla v29.2d, v1.2d, v11.d[0]
  179. ldp d14, d15, [pB], #16
  180. fmla v18.2d, v2.2d, v8.d[0]
  181. fmla v22.2d, v2.2d, v9.d[0]
  182. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  183. fmla v26.2d, v2.2d, v10.d[0]
  184. fmla v30.2d, v2.2d, v11.d[0]
  185. fmla v19.2d, v3.2d, v8.d[0]
  186. fmla v23.2d, v3.2d, v9.d[0]
  187. ldp q6, q7, [pA], #32
  188. fmla v27.2d, v3.2d, v10.d[0]
  189. fmla v31.2d, v3.2d, v11.d[0]
  190. .endm
  // KERNEL8x4_M2: counterpart of KERNEL8x4_M1 using the other register set.
  // Accumulates v4-v7 (A) times d12-d15 (B) into v16-v31 while loading the
  // operands for the following M1 step into v0-v3 and d8-d11.
  // Expects v4-v7 and d12-d15 to have been loaded by the preceding step.
  // Pointer movement: pA += 64 bytes, pB += 32 bytes. Issues one B prefetch.
  191. .macro KERNEL8x4_M2
  192. fmla v16.2d, v4.2d, v12.d[0]
  193. fmla v20.2d, v4.2d, v13.d[0]
  194. fmla v24.2d, v4.2d, v14.d[0]
  195. fmla v28.2d, v4.2d, v15.d[0]
  196. ldp q0, q1, [pA], #32
  197. fmla v17.2d, v5.2d, v12.d[0]
  198. fmla v25.2d, v5.2d, v14.d[0]
  199. ldp d8, d9, [pB], #16
  200. fmla v21.2d, v5.2d, v13.d[0]
  201. fmla v29.2d, v5.2d, v15.d[0]
  202. ldp d10, d11, [pB], #16
  203. fmla v18.2d, v6.2d, v12.d[0]
  204. fmla v22.2d, v6.2d, v13.d[0]
  205. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  206. fmla v26.2d, v6.2d, v14.d[0]
  207. fmla v30.2d, v6.2d, v15.d[0]
  208. fmla v19.2d, v7.2d, v12.d[0]
  209. fmla v23.2d, v7.2d, v13.d[0]
  210. ldp q2, q3, [pA], #32
  211. fmla v27.2d, v7.2d, v14.d[0]
  212. fmla v31.2d, v7.2d, v15.d[0]
  213. .endm
  // KERNEL8x4_E: closing step of the unrolled 8x4 loop.
  // Performs the same sixteen accumulations as KERNEL8x4_M2 (v4-v7 times
  // d12-d15 into v16-v31) but loads nothing, so pA and pB are left unchanged.
  // Issues one B prefetch.
  214. .macro KERNEL8x4_E
  215. fmla v16.2d, v4.2d, v12.d[0]
  216. fmla v20.2d, v4.2d, v13.d[0]
  217. fmla v24.2d, v4.2d, v14.d[0]
  218. fmla v28.2d, v4.2d, v15.d[0]
  219. fmla v17.2d, v5.2d, v12.d[0]
  220. fmla v25.2d, v5.2d, v14.d[0]
  221. fmla v21.2d, v5.2d, v13.d[0]
  222. fmla v29.2d, v5.2d, v15.d[0]
  223. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  224. fmla v18.2d, v6.2d, v12.d[0]
  225. fmla v22.2d, v6.2d, v13.d[0]
  226. fmla v26.2d, v6.2d, v14.d[0]
  227. fmla v30.2d, v6.2d, v15.d[0]
  228. fmla v19.2d, v7.2d, v12.d[0]
  229. fmla v23.2d, v7.2d, v13.d[0]
  230. fmla v27.2d, v7.2d, v14.d[0]
  231. fmla v31.2d, v7.2d, v15.d[0]
  232. .endm
  // KERNEL8x4_SUB: one self-contained 8x4 update step.
  // Loads eight A doubles into v0-v3 and four B doubles into d8-d11, then
  // accumulates their products into v16-v31 with fmla, so the accumulators
  // must already hold valid values (from KERNEL8x4_I or INIT8x4).
  // Pointer movement: pA += 64 bytes, pB += 32 bytes.
  233. .macro KERNEL8x4_SUB
  234. ldp q0, q1, [pA], #32
  235. ldp d8, d9, [pB], #16
  236. fmla v16.2d, v0.2d, v8.d[0]
  237. fmla v20.2d, v0.2d, v9.d[0]
  238. ldp d10, d11, [pB], #16
  239. fmla v17.2d, v1.2d, v8.d[0]
  240. fmla v21.2d, v1.2d, v9.d[0]
  241. ldp q2, q3, [pA], #32
  242. fmla v24.2d, v0.2d, v10.d[0]
  243. fmla v28.2d, v0.2d, v11.d[0]
  244. fmla v25.2d, v1.2d, v10.d[0]
  245. fmla v29.2d, v1.2d, v11.d[0]
  246. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  247. fmla v18.2d, v2.2d, v8.d[0]
  248. fmla v22.2d, v2.2d, v9.d[0]
  249. prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
  250. fmla v26.2d, v2.2d, v10.d[0]
  251. fmla v30.2d, v2.2d, v11.d[0]
  252. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  253. fmla v19.2d, v3.2d, v8.d[0]
  254. fmla v27.2d, v3.2d, v10.d[0]
  255. fmla v31.2d, v3.2d, v11.d[0]
  256. fmla v23.2d, v3.2d, v9.d[0]
  257. .endm
  // SAVE8x4: write back the 8x4 tile as C += alpha * accumulator.
  // For each of pCRow0..pCRow3 it loads eight doubles, adds alpha times the
  // matching accumulators (v16-v19, v20-v23, v24-v27, v28-v31) and stores
  // them back, leaving each pointer advanced by 64 bytes.
  // Overwrites v0-v7 and d10. alpha is copied from the general register into
  // d10 here because the 8x4 kernels use d10 for B values.
  258. .macro SAVE8x4
  259. fmov alpha0, alpha
  260. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  261. ldp q0, q1, [pCRow0]
  262. fmla v0.2d, v16.2d, alphaV0
  263. fmla v1.2d, v17.2d, alphaV0
  264. stp q0, q1, [pCRow0]
  265. add pCRow0, pCRow0, #32
  266. ldp q2, q3, [pCRow0]
  267. fmla v2.2d, v18.2d, alphaV0
  268. fmla v3.2d, v19.2d, alphaV0
  269. stp q2, q3, [pCRow0]
  270. add pCRow0, pCRow0, #32
  271. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  272. ldp q4, q5, [pCRow1]
  273. fmla v4.2d, v20.2d, alphaV0
  274. fmla v5.2d, v21.2d, alphaV0
  275. stp q4, q5, [pCRow1]
  276. add pCRow1, pCRow1, #32
  277. ldp q6, q7, [pCRow1]
  278. fmla v6.2d, v22.2d, alphaV0
  279. fmla v7.2d, v23.2d, alphaV0
  280. stp q6, q7, [pCRow1]
  281. add pCRow1, pCRow1, #32
  282. prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
  283. ldp q0, q1, [pCRow2]
  284. fmla v0.2d, v24.2d, alphaV0
  285. fmla v1.2d, v25.2d, alphaV0
  286. stp q0, q1, [pCRow2]
  287. add pCRow2, pCRow2, #32
  288. ldp q2, q3, [pCRow2]
  289. fmla v2.2d, v26.2d, alphaV0
  290. fmla v3.2d, v27.2d, alphaV0
  291. stp q2, q3, [pCRow2]
  292. add pCRow2, pCRow2, #32
  293. prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
  294. ldp q4, q5, [pCRow3]
  295. fmla v4.2d, v28.2d, alphaV0
  296. fmla v5.2d, v29.2d, alphaV0
  297. stp q4, q5, [pCRow3]
  298. add pCRow3, pCRow3, #32
  299. ldp q6, q7, [pCRow3]
  300. fmla v6.2d, v30.2d, alphaV0
  301. fmla v7.2d, v31.2d, alphaV0
  302. stp q6, q7, [pCRow3]
  303. add pCRow3, pCRow3, #32
  304. .endm
  305. /******************************************************************************/
  // INIT4x4: zero the eight 4x4 accumulators v16, v17, v20, v21, v24, v25,
  // v28 and v29. d16 is cleared from xzr; the rest are copies of registers
  // already zeroed earlier in the macro.
  306. .macro INIT4x4
  307. fmov d16, xzr
  308. fmov d17, d16
  309. fmov d20, d17
  310. fmov d21, d16
  311. fmov d24, d17
  312. fmov d25, d16
  313. fmov d28, d17
  314. fmov d29, d16
  315. .endm
  // KERNEL4x4_SUB: one 4x4 update step.
  // Loads four B doubles into v8/v9 and four A doubles into v0/v1, advancing
  // pB and pA by 32 bytes each, then accumulates:
  //   v16,v17 += A * v8.d[0]    v20,v21 += A * v8.d[1]
  //   v24,v25 += A * v9.d[0]    v28,v29 += A * v9.d[1]
  // The fmla instructions are issued in an interleaved order across
  // accumulators.
  316. .macro KERNEL4x4_SUB
  317. ld1 {v8.2d, v9.2d}, [pB]
  318. add pB, pB, #32
  319. ld1 {v0.2d, v1.2d}, [pA]
  320. add pA, pA, #32
  321. fmla v16.2d, v0.2d, v8.d[0]
  322. fmla v29.2d, v1.2d, v9.d[1]
  323. fmla v20.2d, v0.2d, v8.d[1]
  324. fmla v25.2d, v1.2d, v9.d[0]
  325. fmla v24.2d, v0.2d, v9.d[0]
  326. fmla v21.2d, v1.2d, v8.d[1]
  327. fmla v28.2d, v0.2d, v9.d[1]
  328. fmla v17.2d, v1.2d, v8.d[0]
  329. .endm
  // SAVE4x4: write back the 4x4 tile as C += alpha * accumulator.
  // For each of pCRow0..pCRow3 it loads four doubles, adds alpha times the
  // matching accumulator pair (v16/v17, v20/v21, v24/v25, v28/v29), stores
  // the result, issues a prefetch hint and advances the pointer by 32 bytes.
  // Overwrites v8, v9, v12, v13 and d10 (alpha0).
  330. .macro SAVE4x4
  331. fmov alpha0, alpha
  332. ld1 {v8.2d, v9.2d}, [pCRow0]
  333. fmla v8.2d, v16.2d, alphaV0
  334. fmla v9.2d, v17.2d, alphaV0
  335. st1 {v8.2d, v9.2d}, [pCRow0]
  336. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  337. add pCRow0, pCRow0, #32
  338. ld1 {v12.2d, v13.2d}, [pCRow1]
  339. fmla v12.2d, v20.2d, alphaV0
  340. fmla v13.2d, v21.2d, alphaV0
  341. st1 {v12.2d, v13.2d}, [pCRow1]
  342. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  343. add pCRow1, pCRow1, #32
  344. ld1 {v8.2d, v9.2d}, [pCRow2]
  345. fmla v8.2d, v24.2d, alphaV0
  346. fmla v9.2d, v25.2d, alphaV0
  347. st1 {v8.2d, v9.2d}, [pCRow2]
  348. prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
  349. add pCRow2, pCRow2, #32
  350. ld1 {v12.2d, v13.2d}, [pCRow3]
  351. fmla v12.2d, v28.2d, alphaV0
  352. fmla v13.2d, v29.2d, alphaV0
  353. st1 {v12.2d, v13.2d}, [pCRow3]
  354. prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
  355. add pCRow3, pCRow3, #32
  356. .endm
  357. /******************************************************************************/
  // INIT2x4: zero the four 2x4 accumulators v16, v20, v24 and v28.
  358. .macro INIT2x4
  359. fmov d16, xzr
  360. fmov d20, d16
  361. fmov d24, d20
  362. fmov d28, d16
  363. .endm
  // KERNEL2x4_SUB: one 2x4 update step.
  // Loads four B doubles into v8/v9 (pB += 32 bytes) and two A doubles into
  // v0 (pA += 16 bytes), then accumulates v0 times each B element into
  // v16, v20, v24 and v28 respectively.
  364. .macro KERNEL2x4_SUB
  365. ld1 {v8.2d, v9.2d}, [pB]
  366. add pB, pB, #32
  367. ld1 {v0.2d}, [pA]
  368. add pA, pA, #16
  369. fmla v16.2d, v0.2d, v8.d[0]
  370. fmla v20.2d, v0.2d, v8.d[1]
  371. fmla v24.2d, v0.2d, v9.d[0]
  372. fmla v28.2d, v0.2d, v9.d[1]
  373. .endm
  // SAVE2x4: write back the 2x4 tile as C += alpha * accumulator.
  // For each of pCRow0..pCRow3 it loads two doubles, adds alpha times the
  // matching accumulator (v16, v20, v24, v28), stores the result, issues a
  // prefetch hint and advances the pointer by 16 bytes.
  // Overwrites v8, v12 and d10 (alpha0).
  374. .macro SAVE2x4
  375. fmov alpha0, alpha
  376. ld1 {v8.2d}, [pCRow0]
  377. fmla v8.2d, v16.2d, alphaV0
  378. st1 {v8.2d}, [pCRow0]
  379. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  380. add pCRow0, pCRow0, #16
  381. ld1 {v12.2d}, [pCRow1]
  382. fmla v12.2d, v20.2d, alphaV0
  383. st1 {v12.2d}, [pCRow1]
  384. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  385. add pCRow1, pCRow1, #16
  386. ld1 {v8.2d}, [pCRow2]
  387. fmla v8.2d, v24.2d, alphaV0
  388. st1 {v8.2d}, [pCRow2]
  389. prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
  390. add pCRow2, pCRow2, #16
  391. ld1 {v12.2d}, [pCRow3]
  392. fmla v12.2d, v28.2d, alphaV0
  393. st1 {v12.2d}, [pCRow3]
  394. prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
  395. add pCRow3, pCRow3, #16
  396. .endm
  397. /******************************************************************************/
  // INIT1x4: zero the two 1x4 accumulators v16 and v20.
  398. .macro INIT1x4
  399. fmov d16, xzr
  400. fmov d20, d16
  401. .endm
  // KERNEL1x4_SUB: one 1x4 update step.
  // Loads one A double into d0 (pA += 8 bytes) and four B doubles into v8/v9
  // (pB += 32 bytes). Here the B vectors are the vector operand and the A
  // value is the scalar: v16 += v8 * a and v20 += v9 * a, so each accumulator
  // lane corresponds to one of the four B elements.
  402. .macro KERNEL1x4_SUB
  403. ldr d0, [pA]
  404. add pA, pA, #8
  405. ld1 {v8.2d, v9.2d}, [pB]
  406. add pB, pB, #32
  407. fmla v16.2d, v8.2d, v0.d[0]
  408. fmla v20.2d, v9.2d, v0.d[0]
  409. .endm
  // SAVE1x4: write back the 1x4 tile as C += alpha * accumulator.
  // The two lanes of each accumulator belong to different C pointers, so one
  // double is gathered from pCRow0 and pCRow1 into the lanes of v8 (and from
  // pCRow2 and pCRow3 into v12), updated with a single fmla, and scattered
  // back lane by lane. Each pointer is advanced by 8 bytes.
  // Overwrites v8, v12 and d10 (alpha0).
  410. .macro SAVE1x4
  411. fmov alpha0, alpha
  412. ld1 {v8.d}[0], [pCRow0]
  413. ld1 {v8.d}[1], [pCRow1]
  414. fmla v8.2d, v16.2d, alphaV0
  415. st1 {v8.d}[0], [pCRow0]
  416. st1 {v8.d}[1], [pCRow1]
  417. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  418. add pCRow0, pCRow0, #8
  419. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  420. add pCRow1, pCRow1, #8
  421. ld1 {v12.d}[0], [pCRow2]
  422. ld1 {v12.d}[1], [pCRow3]
  423. fmla v12.2d, v20.2d, alphaV0
  424. st1 {v12.d}[0], [pCRow2]
  425. st1 {v12.d}[1], [pCRow3]
  426. prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
  427. add pCRow2, pCRow2, #8
  428. prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
  429. add pCRow3, pCRow3, #8
  430. .endm
  431. /******************************************************************************/
  // INIT8x2: zero the eight 8x2 accumulators v16-v23, either from xzr or by
  // copying a register zeroed earlier in the macro.
  432. .macro INIT8x2
  433. fmov d16, xzr
  434. fmov d17, xzr
  435. fmov d18, d16
  436. fmov d19, d17
  437. fmov d20, xzr
  438. fmov d21, d16
  439. fmov d22, d17
  440. fmov d23, d18
  441. .endm
  // KERNEL8x2_SUB: one 8x2 update step.
  // Loads eight A doubles into v0-v3 (pA += 64 bytes in two steps) and two B
  // doubles into v8 (pB += 16 bytes), then accumulates A * v8.d[0] into
  // v16-v19 and A * v8.d[1] into v20-v23. Issues one A prefetch.
  442. .macro KERNEL8x2_SUB
  443. ld1 {v0.2d, v1.2d}, [pA]
  444. add pA, pA, #32
  445. ld1 {v8.2d}, [pB]
  446. add pB, pB, #16
  447. ld1 {v2.2d, v3.2d}, [pA]
  448. add pA, pA, #32
  449. fmla v16.2d, v0.2d, v8.d[0]
  450. fmla v17.2d, v1.2d, v8.d[0]
  451. fmla v18.2d, v2.2d, v8.d[0]
  452. fmla v19.2d, v3.2d, v8.d[0]
  453. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  454. fmla v20.2d, v0.2d, v8.d[1]
  455. fmla v21.2d, v1.2d, v8.d[1]
  456. fmla v22.2d, v2.2d, v8.d[1]
  457. fmla v23.2d, v3.2d, v8.d[1]
  458. .endm
  // SAVE8x2: write back the 8x2 tile as C += alpha * accumulator.
  // Loads eight doubles from pCRow0, adds alpha * v16-v19 and stores them;
  // then does the same for pCRow1 with v20-v23. Each pointer is advanced by
  // 64 bytes. Overwrites v0-v7 and d10 (alpha0).
  459. .macro SAVE8x2
  460. fmov alpha0, alpha
  461. ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
  462. fmla v0.2d, v16.2d, alphaV0
  463. fmla v1.2d, v17.2d, alphaV0
  464. fmla v2.2d, v18.2d, alphaV0
  465. fmla v3.2d, v19.2d, alphaV0
  466. st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
  467. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  468. add pCRow0, pCRow0, #64
  469. ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
  470. fmla v4.2d, v20.2d, alphaV0
  471. fmla v5.2d, v21.2d, alphaV0
  472. fmla v6.2d, v22.2d, alphaV0
  473. fmla v7.2d, v23.2d, alphaV0
  474. st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
  475. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  476. add pCRow1, pCRow1, #64
  477. .endm
  478. /******************************************************************************/
  // INIT4x2: zero the four 4x2 accumulators v16, v17, v20 and v21.
  479. .macro INIT4x2
  480. fmov d16, xzr
  481. fmov d17, d16
  482. fmov d20, d17
  483. fmov d21, d16
  484. .endm
  // KERNEL4x2_SUB: one 4x2 update step.
  // Loads two B doubles into v8 (pB += 16 bytes) and four A doubles into
  // v0/v1 (pA += 32 bytes), then accumulates A * v8.d[0] into v16/v17 and
  // A * v8.d[1] into v20/v21.
  485. .macro KERNEL4x2_SUB
  486. ld1 {v8.2d}, [pB]
  487. add pB, pB, #16
  488. ld1 {v0.2d, v1.2d}, [pA]
  489. add pA, pA, #32
  490. fmla v16.2d, v0.2d, v8.d[0]
  491. fmla v17.2d, v1.2d, v8.d[0]
  492. fmla v20.2d, v0.2d, v8.d[1]
  493. fmla v21.2d, v1.2d, v8.d[1]
  494. .endm
  // SAVE4x2: write back the 4x2 tile as C += alpha * accumulator.
  // Updates four doubles at pCRow0 with v16/v17 and four at pCRow1 with
  // v20/v21, advancing each pointer by 32 bytes.
  // Overwrites v8, v9, v12, v13 and d10 (alpha0).
  495. .macro SAVE4x2
  496. fmov alpha0, alpha
  497. ld1 {v8.2d, v9.2d}, [pCRow0]
  498. fmla v8.2d, v16.2d, alphaV0
  499. fmla v9.2d, v17.2d, alphaV0
  500. st1 {v8.2d, v9.2d}, [pCRow0]
  501. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  502. add pCRow0, pCRow0, #32
  503. ld1 {v12.2d, v13.2d}, [pCRow1]
  504. fmla v12.2d, v20.2d, alphaV0
  505. fmla v13.2d, v21.2d, alphaV0
  506. st1 {v12.2d, v13.2d}, [pCRow1]
  507. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  508. add pCRow1, pCRow1, #32
  509. .endm
  510. /******************************************************************************/
  // INIT2x2: zero the two 2x2 accumulators v16 and v20.
  511. .macro INIT2x2
  512. fmov d16, xzr
  513. fmov d20, d16
  514. .endm
  // KERNEL2x2_SUB: one 2x2 update step.
  // Loads two B doubles into v8 and two A doubles into v0 (both pointers
  // += 16 bytes), then v16 += v0 * v8.d[0] and v20 += v0 * v8.d[1].
  515. .macro KERNEL2x2_SUB
  516. ld1 {v8.2d}, [pB]
  517. add pB, pB, #16
  518. ld1 {v0.2d}, [pA]
  519. add pA, pA, #16
  520. fmla v16.2d, v0.2d, v8.d[0]
  521. fmla v20.2d, v0.2d, v8.d[1]
  522. .endm
  // SAVE2x2: write back the 2x2 tile as C += alpha * accumulator.
  // Updates two doubles at pCRow0 with v16 and two at pCRow1 with v20,
  // advancing each pointer by 16 bytes. Overwrites v8, v12 and d10 (alpha0).
  523. .macro SAVE2x2
  524. fmov alpha0, alpha
  525. ld1 {v8.2d}, [pCRow0]
  526. fmla v8.2d, v16.2d, alphaV0
  527. st1 {v8.2d}, [pCRow0]
  528. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  529. add pCRow0, pCRow0, #16
  530. ld1 {v12.2d}, [pCRow1]
  531. fmla v12.2d, v20.2d, alphaV0
  532. st1 {v12.2d}, [pCRow1]
  533. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  534. add pCRow1, pCRow1, #16
  535. .endm
  536. /******************************************************************************/
  // INIT1x2: zero the single 1x2 accumulator v16.
  537. .macro INIT1x2
  538. fmov d16, xzr
  539. .endm
  // KERNEL1x2_SUB: one 1x2 update step.
  // Loads two B doubles into v8 (pB += 16 bytes) and one A double into d0
  // (pA += 8 bytes), then v16 += v8 * a, so each lane of v16 corresponds to
  // one B element.
  540. .macro KERNEL1x2_SUB
  541. ld1 {v8.2d} , [pB]
  542. add pB , pB, #16
  543. ldr d0 , [pA]
  544. add pA, pA, #8
  545. fmla v16.2d, v8.2d, v0.d[0]
  546. .endm
  // SAVE1x2: write back the 1x2 tile as C += alpha * accumulator.
  // Gathers one double from pCRow0 into lane 0 and one from pCRow1 into
  // lane 1 of v8, adds alpha * v16, and scatters the lanes back. Each pointer
  // is advanced by 8 bytes. Overwrites v8 and d10 (alpha0).
  547. .macro SAVE1x2
  548. fmov alpha0, alpha
  549. ld1 {v8.d}[0], [pCRow0]
  550. ld1 {v8.d}[1], [pCRow1]
  551. fmla v8.2d, v16.2d, alphaV0
  552. st1 {v8.d}[0], [pCRow0]
  553. st1 {v8.d}[1], [pCRow1]
  554. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  555. add pCRow0, pCRow0, #8
  556. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  557. add pCRow1, pCRow1, #8
  558. .endm
  559. /******************************************************************************/
  // INIT8x1: zero the four 8x1 accumulators v16-v19.
  560. .macro INIT8x1
  561. fmov d16, xzr
  562. fmov d17, xzr
  563. fmov d18, d16
  564. fmov d19, d17
  565. .endm
  // KERNEL8x1_SUB: one 8x1 update step.
  // Loads eight A doubles into v0-v3 (pA += 64 bytes in two steps) and one B
  // double into d8 (pB += 8 bytes), then accumulates A * b into v16-v19.
  // Issues one A prefetch.
  566. .macro KERNEL8x1_SUB
  567. ld1 {v0.2d, v1.2d}, [pA]
  568. add pA , pA, #32
  569. ldr d8, [pB]
  570. add pB , pB, #8
  571. ld1 {v2.2d, v3.2d}, [pA]
  572. add pA, pA, #32
  573. fmla v16.2d, v0.2d, v8.d[0]
  574. fmla v17.2d, v1.2d, v8.d[0]
  575. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  576. fmla v18.2d, v2.2d, v8.d[0]
  577. fmla v19.2d, v3.2d, v8.d[0]
  578. .endm
  // SAVE8x1: write back the 8x1 tile as C += alpha * accumulator.
  // Updates eight doubles at pCRow0 with v16-v19 and advances pCRow0 by
  // 64 bytes. Overwrites v0-v3 and d10 (alpha0).
  579. .macro SAVE8x1
  580. fmov alpha0, alpha
  581. ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
  582. fmla v0.2d, v16.2d, alphaV0
  583. fmla v1.2d, v17.2d, alphaV0
  584. fmla v2.2d, v18.2d, alphaV0
  585. fmla v3.2d, v19.2d, alphaV0
  586. st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
  587. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  588. add pCRow0, pCRow0, #64
  589. .endm
  590. /******************************************************************************/
  // INIT4x1: zero the two 4x1 accumulators v16 and v17.
  591. .macro INIT4x1
  592. fmov d16, xzr
  593. fmov d17, d16
  594. .endm
  // KERNEL4x1_SUB: one 4x1 update step.
  // Loads one B double into d8 (pB += 8 bytes) and four A doubles into v0/v1
  // (pA += 32 bytes), then accumulates A * b into v16/v17.
  595. .macro KERNEL4x1_SUB
  596. ldr d8, [pB]
  597. add pB , pB, #8
  598. ld1 {v0.2d, v1.2d}, [pA]
  599. add pA , pA, #32
  600. fmla v16.2d, v0.2d, v8.d[0]
  601. fmla v17.2d, v1.2d, v8.d[0]
  602. .endm
  // SAVE4x1: write back the 4x1 tile as C += alpha * accumulator.
  // Updates four doubles at pCRow0 with v16/v17 and advances pCRow0 by
  // 32 bytes. Overwrites v8, v9 and d10 (alpha0).
  603. .macro SAVE4x1
  604. fmov alpha0, alpha
  605. ld1 {v8.2d, v9.2d}, [pCRow0]
  606. fmla v8.2d, v16.2d, alphaV0
  607. fmla v9.2d, v17.2d, alphaV0
  608. st1 {v8.2d, v9.2d}, [pCRow0]
  609. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  610. add pCRow0, pCRow0, #32
  611. .endm
  612. /******************************************************************************/
  // INIT2x1: zero the single 2x1 accumulator v16.
  613. .macro INIT2x1
  614. fmov d16, xzr
  615. .endm
  // KERNEL2x1_SUB: one 2x1 update step.
  // Loads one B double into d8 (pB += 8 bytes) and two A doubles into v0
  // (pA += 16 bytes), then v16 += v0 * b.
  616. .macro KERNEL2x1_SUB
  617. ldr d8, [pB]
  618. add pB , pB, #8
  619. ld1 {v0.2d}, [pA]
  620. add pA , pA, #16
  621. fmla v16.2d, v0.2d, v8.d[0]
  622. .endm
  // SAVE2x1: write back the 2x1 tile as C += alpha * accumulator.
  // Updates two doubles at pCRow0 with v16 and advances pCRow0 by 16 bytes.
  // Overwrites v8 and d10 (alpha0).
  623. .macro SAVE2x1
  624. fmov alpha0, alpha
  625. ld1 {v8.2d}, [pCRow0]
  626. fmla v8.2d, v16.2d, alphaV0
  627. st1 {v8.2d}, [pCRow0]
  628. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  629. add pCRow0, pCRow0, #16
  630. .endm
  631. /******************************************************************************/
  // INIT1x1: zero the scalar 1x1 accumulator d16.
  632. .macro INIT1x1
  633. fmov d16, xzr
  634. .endm
  // KERNEL1x1_SUB: one scalar update step.
  // Loads one B double into d8 and one A double into d0 (both pointers
  // += 8 bytes), then d16 = d0 * d8 + d16 using a scalar fused multiply-add.
  635. .macro KERNEL1x1_SUB
  636. ldr d8, [pB]
  637. add pB , pB, #8
  638. ldr d0, [pA]
  639. add pA , pA, #8
  640. fmadd d16, d0, d8, d16
  641. .endm
  // SAVE1x1: write back a single element as C += alpha * accumulator.
  // Loads one double from pCRow0, computes d8 = d16 * alpha0 + d8, stores it
  // and advances pCRow0 by 8 bytes. Overwrites d8 and d10 (alpha0).
  642. .macro SAVE1x1
  643. fmov alpha0, alpha
  644. ldr d8, [pCRow0]
  645. fmadd d8, d16, alpha0, d8
  646. str d8, [pCRow0]
  647. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  648. add pCRow0, pCRow0, #8
  649. .endm
  650. /*******************************************************************************
  651. * End of macro definitions
  652. *******************************************************************************/
  653. PROLOGUE
  654. .align 5
  655. add sp, sp, #-(11 * 16)
  656. stp d8, d9, [sp, #(0 * 16)]
  657. stp d10, d11, [sp, #(1 * 16)]
  658. stp d12, d13, [sp, #(2 * 16)]
  659. stp d14, d15, [sp, #(3 * 16)]
  660. stp d16, d17, [sp, #(4 * 16)]
  661. stp x18, x19, [sp, #(5 * 16)]
  662. stp x20, x21, [sp, #(6 * 16)]
  663. stp x22, x23, [sp, #(7 * 16)]
  664. stp x24, x25, [sp, #(8 * 16)]
  665. stp x26, x27, [sp, #(9 * 16)]
  666. str x28, [sp, #(10 * 16)]
  667. prfm PLDL1KEEP, [origPB]
  668. prfm PLDL1KEEP, [origPA]
  669. fmov alpha, d0
  670. lsl LDC, LDC, #3 // ldc = ldc * 8
  671. mov pB, origPB
  672. mov counterJ, origN
  673. asr counterJ, counterJ, #2 // J = J / 4
  674. cmp counterJ, #0
  675. ble .Ldgemm_kernel_L2_BEGIN
  676. /******************************************************************************/
  677. .align 5
  678. .Ldgemm_kernel_L4_BEGIN:
  679. mov pCRow0, pC
  680. add pCRow1, pCRow0, LDC
  681. add pCRow2, pCRow1, LDC
  682. add pCRow3, pCRow2, LDC
  683. add pC, pCRow3, LDC
  684. mov pA, origPA // pA = start of A array
  685. .Ldgemm_kernel_L4_M8_BEGIN:
  686. mov counterI, origM
  687. asr counterI, counterI, #3 // counterI = counterI / 8
  688. cmp counterI, #0
  689. ble .Ldgemm_kernel_L4_M4_BEGIN
  690. .align 5
  691. .Ldgemm_kernel_L4_M8_20:
  692. mov pB, origPB
  693. asr counterL , origK, #3 // L = K / 8
  694. cmp counterL , #2 // is there at least 4 to do?
  695. blt .Ldgemm_kernel_L4_M8_32
  696. KERNEL8x4_I
  697. KERNEL8x4_M2
  698. KERNEL8x4_M1
  699. KERNEL8x4_M2
  700. KERNEL8x4_M1
  701. KERNEL8x4_M2
  702. KERNEL8x4_M1
  703. KERNEL8x4_M2
  704. subs counterL, counterL, #2 // subtract 2
  705. ble .Ldgemm_kernel_L4_M8_22a
  706. .align 5
  707. .Ldgemm_kernel_L4_M8_22:
  708. KERNEL8x4_M1
  709. KERNEL8x4_M2
  710. KERNEL8x4_M1
  711. KERNEL8x4_M2
  712. KERNEL8x4_M1
  713. KERNEL8x4_M2
  714. KERNEL8x4_M1
  715. KERNEL8x4_M2
  716. subs counterL, counterL, #1
  717. bgt .Ldgemm_kernel_L4_M8_22
  718. .align 5
  719. .Ldgemm_kernel_L4_M8_22a:
  720. KERNEL8x4_M1
  721. KERNEL8x4_M2
  722. KERNEL8x4_M1
  723. KERNEL8x4_M2
  724. KERNEL8x4_M1
  725. KERNEL8x4_M2
  726. KERNEL8x4_M1
  727. KERNEL8x4_E
  728. b .Ldgemm_kernel_L4_M8_44
  729. .align 5
  730. .Ldgemm_kernel_L4_M8_32:
  731. tst counterL, #1
  732. ble .Ldgemm_kernel_L4_M8_40
  733. KERNEL8x4_I
  734. KERNEL8x4_M2
  735. KERNEL8x4_M1
  736. KERNEL8x4_M2
  737. KERNEL8x4_M1
  738. KERNEL8x4_M2
  739. KERNEL8x4_M1
  740. KERNEL8x4_E
  741. b .Ldgemm_kernel_L4_M8_44
  742. .Ldgemm_kernel_L4_M8_40:
  743. INIT8x4
  744. .Ldgemm_kernel_L4_M8_44:
  745. ands counterL , origK, #7
  746. ble .Ldgemm_kernel_L4_M8_100
  747. .align 5
  748. .Ldgemm_kernel_L4_M8_46:
  749. KERNEL8x4_SUB
  750. subs counterL, counterL, #1
  751. bne .Ldgemm_kernel_L4_M8_46
  752. .Ldgemm_kernel_L4_M8_100:
  753. prfm PLDL1KEEP, [pA]
  754. prfm PLDL1KEEP, [pA, #64]
  755. prfm PLDL1KEEP, [origPB]
  756. SAVE8x4
  757. .Ldgemm_kernel_L4_M8_END:
  758. subs counterI, counterI, #1
  759. bne .Ldgemm_kernel_L4_M8_20
  760. .Ldgemm_kernel_L4_M4_BEGIN:
  761. mov counterI, origM
  762. tst counterI , #7
  763. ble .Ldgemm_kernel_L4_END
  764. tst counterI, #4
  765. ble .Ldgemm_kernel_L4_M2_BEGIN
  766. .Ldgemm_kernel_L4_M4_20:
  767. INIT4x4
  768. mov pB, origPB
  769. asr counterL , origK, #3 // counterL = counterL / 8
  770. cmp counterL , #0
  771. ble .Ldgemm_kernel_L4_M4_40
  772. .align 5
  773. .Ldgemm_kernel_L4_M4_22:
  774. KERNEL4x4_SUB
  775. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  776. KERNEL4x4_SUB
  777. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  778. KERNEL4x4_SUB
  779. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  780. KERNEL4x4_SUB
  781. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  782. KERNEL4x4_SUB
  783. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  784. KERNEL4x4_SUB
  785. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  786. KERNEL4x4_SUB
  787. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  788. KERNEL4x4_SUB
  789. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  790. subs counterL, counterL, #1
  791. bgt .Ldgemm_kernel_L4_M4_22
  792. .Ldgemm_kernel_L4_M4_40:
  793. ands counterL , origK, #7 // counterL = counterL % 8
  794. ble .Ldgemm_kernel_L4_M4_100
  795. .Ldgemm_kernel_L4_M4_42:
  796. KERNEL4x4_SUB
  797. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  798. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  799. subs counterL, counterL, #1
  800. bgt .Ldgemm_kernel_L4_M4_42
  801. .Ldgemm_kernel_L4_M4_100:
  802. SAVE4x4
  803. .Ldgemm_kernel_L4_M4_END:
  // 2x4 tile of the 4-column (L4) pass: handled only when bit 1 of M is set.
  // Note: tst only sets flags (counterI is not modified). Because tst/ands
  // clear V and the masked value is never negative, the following `ble`
  // behaves as "branch if the masked value is zero".
  // INIT2x4 / KERNEL2x4_SUB / SAVE2x4 are macros defined outside this section;
  // presumably init / one K-step / store of the tile -- TODO confirm.
  804. .Ldgemm_kernel_L4_M2_BEGIN:
  805. mov counterI, origM
  806. tst counterI , #3
  807. ble .Ldgemm_kernel_L4_END // M % 4 == 0: no 2-row or 1-row remainder
  808. tst counterI, #2 // test bit 1 of M (is a 2-row tile needed?)
  809. ble .Ldgemm_kernel_L4_M1_BEGIN // bit clear: go check the 1-row remainder
  810. .Ldgemm_kernel_L4_M2_20:
  811. INIT2x4
  812. mov pB, origPB // pB = origPB
  813. asr counterL , origK, #3 // counterL = origK / 8 (unrolled-loop trip count)
  814. cmp counterL , #0
  815. ble .Ldgemm_kernel_L4_M2_40 // K < 8: skip the unrolled loop
  816. .align 5
  817. .Ldgemm_kernel_L4_M2_22: // unrolled loop: 8 KERNEL2x4_SUB per pass
  818. KERNEL2x4_SUB
  819. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  820. KERNEL2x4_SUB
  821. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  822. KERNEL2x4_SUB
  823. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  824. KERNEL2x4_SUB
  825. KERNEL2x4_SUB
  826. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  827. KERNEL2x4_SUB
  828. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  829. KERNEL2x4_SUB
  830. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  831. KERNEL2x4_SUB
  832. subs counterL, counterL, #1
  833. bgt .Ldgemm_kernel_L4_M2_22 // repeat while counterL > 0
  834. .Ldgemm_kernel_L4_M2_40:
  835. ands counterL , origK, #7 // counterL = origK % 8 (leftover K-steps)
  836. ble .Ldgemm_kernel_L4_M2_100 // no leftover: go straight to the save
  837. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  838. prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
  839. .Ldgemm_kernel_L4_M2_42: // remainder loop: one KERNEL2x4_SUB per pass
  840. KERNEL2x4_SUB
  841. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  842. subs counterL, counterL, #1
  843. bgt .Ldgemm_kernel_L4_M2_42
  844. .Ldgemm_kernel_L4_M2_100:
  845. SAVE2x4
  846. .Ldgemm_kernel_L4_M2_END:
  // 1x4 tile of the 4-column (L4) pass: handled only when bit 0 of M is set.
  // counterI was last written by `mov counterI, origM` at
  // .Ldgemm_kernel_L4_M2_BEGIN; this assumes the macros in between leave it
  // untouched -- TODO confirm.
  // INIT1x4 / KERNEL1x4_SUB / SAVE1x4 are macros defined outside this section;
  // presumably init / one K-step / store of the tile -- TODO confirm.
  847. .Ldgemm_kernel_L4_M1_BEGIN:
  848. tst counterI, #1 // test bit 0 (flags only; counterI is not modified)
  849. ble .Ldgemm_kernel_L4_END // bit clear: no 1-row remainder
  850. .Ldgemm_kernel_L4_M1_20:
  851. INIT1x4
  852. mov pB, origPB // pB = origPB
  853. asr counterL , origK, #3 // counterL = origK / 8 (unrolled-loop trip count)
  854. cmp counterL , #0
  855. ble .Ldgemm_kernel_L4_M1_40 // K < 8: skip the unrolled loop
  856. .align 5
  857. .Ldgemm_kernel_L4_M1_22: // unrolled loop: 8 KERNEL1x4_SUB per pass
  858. KERNEL1x4_SUB
  859. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  860. KERNEL1x4_SUB
  861. KERNEL1x4_SUB
  862. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  863. KERNEL1x4_SUB
  864. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  865. KERNEL1x4_SUB
  866. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  867. KERNEL1x4_SUB
  868. KERNEL1x4_SUB
  869. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  870. KERNEL1x4_SUB
  871. subs counterL, counterL, #1
  872. bgt .Ldgemm_kernel_L4_M1_22 // repeat while counterL > 0
  873. .Ldgemm_kernel_L4_M1_40:
  874. ands counterL , origK, #7 // counterL = origK % 8 (leftover K-steps)
  875. ble .Ldgemm_kernel_L4_M1_100 // no leftover: go straight to the save
  876. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  877. .Ldgemm_kernel_L4_M1_42: // remainder loop: one KERNEL1x4_SUB per pass
  878. KERNEL1x4_SUB
  879. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  880. subs counterL, counterL, #1
  881. bgt .Ldgemm_kernel_L4_M1_42
  882. .Ldgemm_kernel_L4_M1_100:
  883. SAVE1x4
  // End of one 4-column block: step origPB forward by K*32 bytes and loop
  // back to .Ldgemm_kernel_L4_BEGIN while counterJ is still positive.
  884. .Ldgemm_kernel_L4_END:
  885. lsl temp, origK, #5 // temp = K * 32
  886. add origPB, origPB, temp // B = B + K * 4 * 8
  887. subs counterJ, counterJ , #1 // j--
  888. bgt .Ldgemm_kernel_L4_BEGIN // next 4-column block while j > 0
  889. /******************************************************************************/
  // N remainder after the 4-column loop: set up a 2-column block when bit 1
  // of N is set. tst only sets flags; with V cleared and a non-negative
  // masked value, `ble` acts as "branch if the masked value is zero".
  890. .Ldgemm_kernel_L2_BEGIN: // fewer than 4 columns left in N direction
  891. mov counterJ , origN
  892. tst counterJ , #3
  893. ble .Ldgemm_kernel_L999 // N % 4 == 0: nothing left, go to exit
  894. tst counterJ , #2
  895. ble .Ldgemm_kernel_L1_BEGIN // bit 1 of N clear: only the 1-column case can remain
  896. mov pCRow0, pC // pCRow0 = pC
  897. add pCRow1, pCRow0, LDC // pCRow1 = pCRow0 + LDC
  898. add pC, pCRow1, LDC // pC = pC + 2 * LDC (past the two columns set up here)
  899. mov pA, origPA // pA = A
  // 8x2 tiles of the 2-column (L2) pass: outer loop runs M/8 times, each
  // pass doing init, K/8 unrolled passes, K%8 single steps, then save.
  // INIT8x2 / KERNEL8x2_SUB / SAVE8x2 are macros defined outside this section;
  // presumably init / one K-step / store of the tile -- TODO confirm.
  900. .Ldgemm_kernel_L2_M8_BEGIN:
  901. mov counterI, origM
  902. asr counterI, counterI, #3 // counterI = counterI / 8
  903. cmp counterI, #0
  904. ble .Ldgemm_kernel_L2_M4_BEGIN // M < 8: no 8-row tiles
  905. .align 5
  906. .Ldgemm_kernel_L2_M8_20: // top of the per-tile loop
  907. INIT8x2
  908. mov pB, origPB // pB = origPB
  909. asr counterL , origK, #3 // counterL = origK / 8 (unrolled-loop trip count)
  910. cmp counterL,#0
  911. ble .Ldgemm_kernel_L2_M8_40 // K < 8: skip the unrolled loop
  912. .align 5
  913. .Ldgemm_kernel_L2_M8_22: // unrolled loop: 8 KERNEL8x2_SUB per pass
  914. KERNEL8x2_SUB
  915. KERNEL8x2_SUB
  916. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  917. KERNEL8x2_SUB
  918. KERNEL8x2_SUB
  919. KERNEL8x2_SUB
  920. KERNEL8x2_SUB
  921. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  922. KERNEL8x2_SUB
  923. KERNEL8x2_SUB
  924. subs counterL, counterL, #1
  925. bgt .Ldgemm_kernel_L2_M8_22 // repeat while counterL > 0
  926. .Ldgemm_kernel_L2_M8_40:
  927. ands counterL , origK, #7 // counterL = origK % 8 (leftover K-steps)
  928. ble .Ldgemm_kernel_L2_M8_100 // no leftover: go straight to the save
  929. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  930. prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
  931. .Ldgemm_kernel_L2_M8_42: // remainder loop: one KERNEL8x2_SUB per pass
  932. KERNEL8x2_SUB
  933. subs counterL, counterL, #1
  934. bgt .Ldgemm_kernel_L2_M8_42
  935. .Ldgemm_kernel_L2_M8_100:
  936. SAVE8x2
  937. .Ldgemm_kernel_L2_M8_END:
  938. subs counterI, counterI, #1
  939. bgt .Ldgemm_kernel_L2_M8_20 // next 8-row tile while counterI > 0
  // 4x2 tile of the 2-column (L2) pass: handled only when bit 2 of M is set.
  // tst only sets flags (counterI is not modified); with V cleared and a
  // non-negative masked value, `ble` acts as "branch if the masked value is zero".
  // INIT4x2 / KERNEL4x2_SUB / SAVE4x2 are macros defined outside this section;
  // presumably init / one K-step / store of the tile -- TODO confirm.
  940. .Ldgemm_kernel_L2_M4_BEGIN:
  941. mov counterI, origM
  942. tst counterI , #7
  943. ble .Ldgemm_kernel_L2_END // M % 8 == 0: no row remainder at all
  944. tst counterI, #4 // test bit 2 of M (is a 4-row tile needed?)
  945. ble .Ldgemm_kernel_L2_M2_BEGIN // bit clear: go check the 2-row remainder
  946. .Ldgemm_kernel_L2_M4_20:
  947. INIT4x2
  948. mov pB, origPB // pB = origPB
  949. asr counterL , origK, #3 // counterL = origK / 8 (unrolled-loop trip count)
  950. cmp counterL,#0
  951. ble .Ldgemm_kernel_L2_M4_40 // K < 8: skip the unrolled loop
  952. .align 5
  953. .Ldgemm_kernel_L2_M4_22: // unrolled loop: 8 KERNEL4x2_SUB per pass
  954. KERNEL4x2_SUB
  955. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  956. KERNEL4x2_SUB
  957. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  958. KERNEL4x2_SUB
  959. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  960. KERNEL4x2_SUB
  961. KERNEL4x2_SUB
  962. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  963. KERNEL4x2_SUB
  964. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  965. KERNEL4x2_SUB
  966. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  967. KERNEL4x2_SUB
  968. subs counterL, counterL, #1
  969. bgt .Ldgemm_kernel_L2_M4_22 // repeat while counterL > 0
  970. .Ldgemm_kernel_L2_M4_40:
  971. ands counterL , origK, #7 // counterL = origK % 8 (leftover K-steps)
  972. ble .Ldgemm_kernel_L2_M4_100 // no leftover: go straight to the save
  973. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  974. prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
  975. .Ldgemm_kernel_L2_M4_42: // remainder loop: one KERNEL4x2_SUB per pass
  976. KERNEL4x2_SUB
  977. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  978. subs counterL, counterL, #1
  979. bgt .Ldgemm_kernel_L2_M4_42
  980. .Ldgemm_kernel_L2_M4_100:
  981. SAVE4x2
  982. .Ldgemm_kernel_L2_M4_END:
  // 2x2 tile of the 2-column (L2) pass: handled only when bit 1 of M is set.
  // tst only sets flags (counterI is not modified); with V cleared and a
  // non-negative masked value, `ble` acts as "branch if the masked value is zero".
  // INIT2x2 / KERNEL2x2_SUB / SAVE2x2 are macros defined outside this section;
  // presumably init / one K-step / store of the tile -- TODO confirm.
  983. .Ldgemm_kernel_L2_M2_BEGIN:
  984. mov counterI, origM
  985. tst counterI , #3
  986. ble .Ldgemm_kernel_L2_END // M % 4 == 0: no 2-row or 1-row remainder
  987. tst counterI, #2 // test bit 1 of M (is a 2-row tile needed?)
  988. ble .Ldgemm_kernel_L2_M1_BEGIN // bit clear: go check the 1-row remainder
  989. .Ldgemm_kernel_L2_M2_20:
  990. INIT2x2
  991. mov pB, origPB // pB = origPB
  992. asr counterL , origK, #3 // counterL = origK / 8 (unrolled-loop trip count)
  993. cmp counterL,#0
  994. ble .Ldgemm_kernel_L2_M2_40 // K < 8: skip the unrolled loop
  995. .Ldgemm_kernel_L2_M2_22: // unrolled loop: 8 KERNEL2x2_SUB per pass
  996. KERNEL2x2_SUB
  997. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  998. KERNEL2x2_SUB
  999. KERNEL2x2_SUB
  1000. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  1001. KERNEL2x2_SUB
  1002. KERNEL2x2_SUB
  1003. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  1004. KERNEL2x2_SUB
  1005. KERNEL2x2_SUB
  1006. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  1007. KERNEL2x2_SUB
  1008. subs counterL, counterL, #1
  1009. bgt .Ldgemm_kernel_L2_M2_22 // repeat while counterL > 0
  // The four prefetch hints below sit before the _40 label, so they execute
  // only on fall-through from the unrolled loop, not when K < 8 jumps to _40.
  1010. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  1011. prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
  1012. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  1013. prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
  1014. .Ldgemm_kernel_L2_M2_40:
  1015. ands counterL , origK, #7 // counterL = origK % 8 (leftover K-steps)
  1016. ble .Ldgemm_kernel_L2_M2_100 // no leftover: go straight to the save
  1017. .Ldgemm_kernel_L2_M2_42: // remainder loop: one KERNEL2x2_SUB per pass
  1018. KERNEL2x2_SUB
  1019. subs counterL, counterL, #1
  1020. bgt .Ldgemm_kernel_L2_M2_42
  1021. .Ldgemm_kernel_L2_M2_100:
  1022. SAVE2x2
  1023. .Ldgemm_kernel_L2_M2_END:
  // 1x2 tile of the 2-column (L2) pass: handled only when bit 0 of M is set.
  // counterI was last written by `mov counterI, origM` at
  // .Ldgemm_kernel_L2_M2_BEGIN; this assumes the macros in between leave it
  // untouched -- TODO confirm.
  // INIT1x2 / KERNEL1x2_SUB / SAVE1x2 are macros defined outside this section;
  // presumably init / one K-step / store of the tile -- TODO confirm.
  1024. .Ldgemm_kernel_L2_M1_BEGIN:
  1025. tst counterI, #1 // test bit 0 (flags only; counterI is not modified)
  1026. ble .Ldgemm_kernel_L2_END // bit clear: no 1-row remainder
  1027. .Ldgemm_kernel_L2_M1_20:
  1028. INIT1x2
  1029. mov pB, origPB // pB = origPB
  1030. asr counterL , origK, #3 // counterL = origK / 8 (unrolled-loop trip count)
  1031. cmp counterL, #0
  1032. ble .Ldgemm_kernel_L2_M1_40 // K < 8: skip the unrolled loop
  1033. .Ldgemm_kernel_L2_M1_22: // unrolled loop: 8 KERNEL1x2_SUB per pass
  1034. KERNEL1x2_SUB
  1035. KERNEL1x2_SUB
  1036. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  1037. KERNEL1x2_SUB
  1038. KERNEL1x2_SUB
  1039. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  1040. KERNEL1x2_SUB
  1041. KERNEL1x2_SUB
  1042. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  1043. KERNEL1x2_SUB
  1044. KERNEL1x2_SUB
  1045. subs counterL, counterL, #1
  1046. bgt .Ldgemm_kernel_L2_M1_22 // repeat while counterL > 0
  // The three prefetch hints below sit before the _40 label, so they execute
  // only on fall-through from the unrolled loop, not when K < 8 jumps to _40.
  1047. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  1048. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  1049. prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
  1050. .Ldgemm_kernel_L2_M1_40:
  1051. ands counterL , origK, #7 // counterL = origK % 8 (leftover K-steps)
  1052. ble .Ldgemm_kernel_L2_M1_100 // no leftover: go straight to the save
  1053. .Ldgemm_kernel_L2_M1_42: // remainder loop: one KERNEL1x2_SUB per pass
  1054. KERNEL1x2_SUB
  1055. subs counterL, counterL, #1
  1056. bgt .Ldgemm_kernel_L2_M1_42
  1057. .Ldgemm_kernel_L2_M1_100:
  1058. SAVE1x2
  // End of the 2-column block: step origPB forward by K*16 bytes, then fall
  // through to the 1-column check.
  1059. .Ldgemm_kernel_L2_END:
  1060. add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
  1061. /******************************************************************************/
  // Final N remainder: set up a single-column block when bit 0 of N is set.
  // tst only sets flags; with V cleared and a non-negative masked value,
  // `ble` acts as "branch if the masked value is zero".
  1062. .Ldgemm_kernel_L1_BEGIN:
  1063. mov counterJ , origN
  1064. tst counterJ , #1
  1065. ble .Ldgemm_kernel_L999 // done
  1066. mov pCRow0, pC // pCRow0 = C
  1067. add pC , pC , LDC // Update pC to point to next
  1068. mov pA, origPA // pA = A
  // 8x1 tiles of the 1-column (L1) pass: outer loop runs M/8 times, each
  // pass doing init, K/8 unrolled passes, K%8 single steps, then save.
  // INIT8x1 / KERNEL8x1_SUB / SAVE8x1 are macros defined outside this section;
  // presumably init / one K-step / store of the tile -- TODO confirm.
  1069. .Ldgemm_kernel_L1_M8_BEGIN:
  1070. mov counterI, origM
  1071. asr counterI, counterI, #3 // counterI = counterI / 8
  1072. cmp counterI, #0
  1073. ble .Ldgemm_kernel_L1_M4_BEGIN // M < 8: no 8-row tiles
  1074. .align 5
  1075. .Ldgemm_kernel_L1_M8_20: // top of the per-tile loop
  1076. INIT8x1
  1077. mov pB, origPB // pB = origPB
  1078. asr counterL , origK, #3 // counterL = origK / 8 (unrolled-loop trip count)
  1079. cmp counterL , #0
  1080. ble .Ldgemm_kernel_L1_M8_40 // K < 8: skip the unrolled loop
  1081. .align 5
  1082. .Ldgemm_kernel_L1_M8_22: // unrolled loop: 8 KERNEL8x1_SUB per pass
  1083. KERNEL8x1_SUB
  1084. KERNEL8x1_SUB
  1085. KERNEL8x1_SUB
  1086. KERNEL8x1_SUB
  1087. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  1088. KERNEL8x1_SUB
  1089. KERNEL8x1_SUB
  1090. KERNEL8x1_SUB
  1091. KERNEL8x1_SUB
  1092. subs counterL, counterL, #1
  1093. bgt .Ldgemm_kernel_L1_M8_22 // repeat while counterL > 0
  1094. .Ldgemm_kernel_L1_M8_40:
  1095. ands counterL , origK, #7 // counterL = origK % 8 (leftover K-steps)
  1096. ble .Ldgemm_kernel_L1_M8_100 // no leftover: go straight to the save
  1097. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  1098. .Ldgemm_kernel_L1_M8_42: // remainder loop: one KERNEL8x1_SUB per pass
  1099. KERNEL8x1_SUB
  1100. subs counterL, counterL, #1
  1101. bgt .Ldgemm_kernel_L1_M8_42
  1102. .Ldgemm_kernel_L1_M8_100:
  1103. SAVE8x1
  1104. .Ldgemm_kernel_L1_M8_END:
  1105. subs counterI, counterI, #1
  1106. bgt .Ldgemm_kernel_L1_M8_20 // next 8-row tile while counterI > 0
  // 4x1 tile of the 1-column (L1) pass: handled only when bit 2 of M is set.
  // tst only sets flags (counterI is not modified); with V cleared and a
  // non-negative masked value, `ble` acts as "branch if the masked value is zero".
  // INIT4x1 / KERNEL4x1_SUB / SAVE4x1 are macros defined outside this section;
  // presumably init / one K-step / store of the tile -- TODO confirm.
  1107. .Ldgemm_kernel_L1_M4_BEGIN:
  1108. mov counterI, origM
  1109. tst counterI , #7
  1110. ble .Ldgemm_kernel_L1_END // M % 8 == 0: no row remainder at all
  1111. tst counterI, #4 // test bit 2 of M (is a 4-row tile needed?)
  1112. ble .Ldgemm_kernel_L1_M2_BEGIN // bit clear: go check the 2-row remainder
  1113. .Ldgemm_kernel_L1_M4_20:
  1114. INIT4x1
  1115. mov pB, origPB // pB = origPB
  1116. asr counterL , origK, #3 // counterL = origK / 8 (unrolled-loop trip count)
  1117. cmp counterL , #0
  1118. ble .Ldgemm_kernel_L1_M4_40 // K < 8: skip the unrolled loop
  1119. .align 5
  1120. .Ldgemm_kernel_L1_M4_22: // unrolled loop: 8 KERNEL4x1_SUB per pass
  1121. KERNEL4x1_SUB
  1122. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  1123. KERNEL4x1_SUB
  1124. KERNEL4x1_SUB
  1125. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  1126. KERNEL4x1_SUB
  1127. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  1128. KERNEL4x1_SUB
  1129. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  1130. KERNEL4x1_SUB
  1131. KERNEL4x1_SUB
  1132. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  1133. KERNEL4x1_SUB
  1134. subs counterL, counterL, #1
  1135. bgt .Ldgemm_kernel_L1_M4_22 // repeat while counterL > 0
  1136. .Ldgemm_kernel_L1_M4_40:
  1137. ands counterL , origK, #7 // counterL = origK % 8 (leftover K-steps)
  1138. ble .Ldgemm_kernel_L1_M4_100 // no leftover: go straight to the save
  1139. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  1140. .Ldgemm_kernel_L1_M4_42: // remainder loop: one KERNEL4x1_SUB per pass
  1141. KERNEL4x1_SUB
  1142. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  1143. subs counterL, counterL, #1
  1144. bgt .Ldgemm_kernel_L1_M4_42
  1145. .Ldgemm_kernel_L1_M4_100:
  1146. SAVE4x1
  1147. .Ldgemm_kernel_L1_M4_END:
  // 2x1 tile of the 1-column (L1) pass: handled only when bit 1 of M is set.
  // tst only sets flags (counterI is not modified); with V cleared and a
  // non-negative masked value, `ble` acts as "branch if the masked value is zero".
  // INIT2x1 / KERNEL2x1_SUB / SAVE2x1 are macros defined outside this section;
  // presumably init / one K-step / store of the tile -- TODO confirm.
  1148. .Ldgemm_kernel_L1_M2_BEGIN:
  1149. mov counterI, origM
  1150. tst counterI , #3
  1151. ble .Ldgemm_kernel_L1_END // M % 4 == 0: no 2-row or 1-row remainder
  1152. tst counterI, #2 // test bit 1 of M (is a 2-row tile needed?)
  1153. ble .Ldgemm_kernel_L1_M1_BEGIN // bit clear: go check the 1-row remainder
  1154. .Ldgemm_kernel_L1_M2_20:
  1155. INIT2x1
  1156. mov pB, origPB // pB = origPB
  1157. asr counterL , origK, #3 // counterL = origK / 8 (unrolled-loop trip count)
  1158. cmp counterL , #0
  1159. ble .Ldgemm_kernel_L1_M2_40 // K < 8: skip the unrolled loop
  1160. .Ldgemm_kernel_L1_M2_22: // unrolled loop: 8 KERNEL2x1_SUB per pass
  1161. KERNEL2x1_SUB
  1162. KERNEL2x1_SUB
  1163. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  1164. KERNEL2x1_SUB
  1165. KERNEL2x1_SUB
  1166. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  1167. KERNEL2x1_SUB
  1168. KERNEL2x1_SUB
  1169. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  1170. KERNEL2x1_SUB
  1171. KERNEL2x1_SUB
  1172. subs counterL, counterL, #1
  1173. bgt .Ldgemm_kernel_L1_M2_22 // repeat while counterL > 0
  // The three prefetch hints below sit before the _40 label, so they execute
  // only on fall-through from the unrolled loop, not when K < 8 jumps to _40.
  1174. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  1175. prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
  1176. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  1177. .Ldgemm_kernel_L1_M2_40:
  1178. ands counterL , origK, #7 // counterL = origK % 8 (leftover K-steps)
  1179. ble .Ldgemm_kernel_L1_M2_100 // no leftover: go straight to the save
  1180. .Ldgemm_kernel_L1_M2_42: // remainder loop: one KERNEL2x1_SUB per pass
  1181. KERNEL2x1_SUB
  1182. subs counterL, counterL, #1
  1183. bgt .Ldgemm_kernel_L1_M2_42
  1184. .Ldgemm_kernel_L1_M2_100:
  1185. SAVE2x1
  1186. .Ldgemm_kernel_L1_M2_END:
  // 1x1 tile of the 1-column (L1) pass: handled only when bit 0 of M is set.
  // counterI was last written by `mov counterI, origM` at
  // .Ldgemm_kernel_L1_M2_BEGIN; this assumes the macros in between leave it
  // untouched -- TODO confirm.
  // INIT1x1 / KERNEL1x1_SUB / SAVE1x1 are macros defined outside this section;
  // presumably init / one K-step / store of the tile -- TODO confirm.
  1187. .Ldgemm_kernel_L1_M1_BEGIN:
  1188. tst counterI, #1 // test bit 0 (flags only; counterI is not modified)
  1189. ble .Ldgemm_kernel_L1_END // bit clear: no 1-row remainder
  1190. .Ldgemm_kernel_L1_M1_20:
  1191. INIT1x1
  1192. mov pB, origPB // pB = origPB
  1193. asr counterL , origK, #3 // counterL = origK / 8 (unrolled-loop trip count)
  1194. cmp counterL , #0
  1195. ble .Ldgemm_kernel_L1_M1_40 // K < 8: skip the unrolled loop
  1196. .Ldgemm_kernel_L1_M1_22: // unrolled loop: 8 KERNEL1x1_SUB per pass
  1197. KERNEL1x1_SUB
  1198. KERNEL1x1_SUB
  1199. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  1200. KERNEL1x1_SUB
  1201. KERNEL1x1_SUB
  1202. KERNEL1x1_SUB
  1203. KERNEL1x1_SUB
  1204. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  1205. KERNEL1x1_SUB
  1206. KERNEL1x1_SUB
  1207. subs counterL, counterL, #1
  1208. bgt .Ldgemm_kernel_L1_M1_22 // repeat while counterL > 0
  1209. .Ldgemm_kernel_L1_M1_40:
  1210. ands counterL , origK, #7 // counterL = origK % 8 (leftover K-steps)
  1211. ble .Ldgemm_kernel_L1_M1_100 // no leftover: go straight to the save
  1212. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  1213. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  1214. .Ldgemm_kernel_L1_M1_42: // remainder loop: one KERNEL1x1_SUB per pass
  1215. KERNEL1x1_SUB
  1216. subs counterL, counterL, #1
  1217. bgt .Ldgemm_kernel_L1_M1_42
  1218. .Ldgemm_kernel_L1_M1_100:
  1219. SAVE1x1
  // Common exit: return 0 in x0, reload d8-d17 and x18-x28 from the
  // 11*16-byte stack area, release it, and return.
  // The matching stores are in the prologue, which is outside this section;
  // presumably these offsets mirror it -- TODO confirm.
  1220. .Ldgemm_kernel_L1_END:
  1221. .Ldgemm_kernel_L999:
  1222. mov x0, #0 // set return value
  1223. ldp d8, d9, [sp, #(0 * 16)] // d8-d15: low halves of v8-v15 are callee-saved under AAPCS64
  1224. ldp d10, d11, [sp, #(1 * 16)]
  1225. ldp d12, d13, [sp, #(2 * 16)]
  1226. ldp d14, d15, [sp, #(3 * 16)]
  1227. ldp d16, d17, [sp, #(4 * 16)]
  1228. ldp x18, x19, [sp, #(5 * 16)] // x18 is the platform register; x19-x28 are callee-saved
  1229. ldp x20, x21, [sp, #(6 * 16)]
  1230. ldp x22, x23, [sp, #(7 * 16)]
  1231. ldp x24, x25, [sp, #(8 * 16)]
  1232. ldp x26, x27, [sp, #(9 * 16)]
  1233. ldr x28, [sp, #(10 * 16)]
  1234. add sp, sp, #(11*16) // pop the 176-byte save area
  1235. ret
  1236. EPILOGUE