/* dgemm_kernel_4x4.S — OpenBLAS AArch64 DGEMM kernel (web-scrape page header removed) */
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. /* X0 X1 X2 s0 X3 x4 x5 x6 */
  30. /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */
  31. #define origM x0
  32. #define origN x1
  33. #define origK x2
  34. #define origPA x3
  35. #define origPB x4
  36. #define pC x5
  37. #define LDC x6
  38. #define temp x7
  39. #define counterL x8
  40. #define counterI x9
  41. #define counterJ x10
  42. #define pB x11
  43. #define pCRow0 x12
  44. #define pCRow1 x13
  45. #define pCRow2 x14
  46. #define pCRow3 x15
  47. #define pA x16
  48. #define ppC x17
  49. #define ppCRow0 x18
  50. #define ppCRow1 x19
  51. #define ppCRow2 x20
  52. #define ppCRow3 x21
  53. #define ppA x22
  54. #define alpha x23
  55. #define alpha0 d10
  56. #define alphaV0 v10.d[0]
  57. #define A_PRE_SIZE 1024
  58. #define B_PRE_SIZE 1024
  59. #define C_PRE_SIZE 128
  60. // 00 origM
  61. // 01 origN
  62. // 02 origK
  63. // 03 origPA
  64. // 04 origPB
  65. // 05 pC
  66. // 06 origLDC -> LDC
  67. // 07 offset -> temp
  68. // 08 counterL
  69. // 09 counterI
  70. // 10 counterJ
  71. // 11 pB
  72. // 12 pCRow0
  73. // 13 pCRow1
  74. // 14 pCRow2
  75. // 15 pCRow3
  76. // 16 pA
  77. // 17 ppC
  78. // 18 must save ppCRow0
  79. // 19 must save ppCRow1
  80. // 20 must save ppCRow2
  81. // 21 must save ppCRow3
  82. // 22 must save ppA
  83. // 23 must save alpha
  84. // 24 must save
  85. // 25 must save
  86. // 26 must save
  87. // 27 must save
  88. // 28 must save
  89. // 29 frame
  90. // 30 link
  91. // 31 sp
  92. //v00 ALPHA -> pA00, pA01
  93. //v01 pA02, pA03
  94. //v02 ppA00, ppA01
  95. //v03 ppA02, ppA03
  96. //v04 pA10, pA11
  97. //v05 pA12, pA13
  98. //v06 ppA10, ppA11
  99. //v07 ppA12, ppA13
  100. //v08 must save pB00, pB01
  101. //v09 must save pB02, pB03
  102. //v10 must save ALPHA0
  103. //v11 must save
  104. //v12 must save pB10, pB11
  105. //v13 must save pB12, pB13
  106. //v14 must save
  107. //v15 must save
  108. //v16 must save C00, C01
  109. //v17 must save C02, C03
  110. //v18 ppC00, ppC01
  111. //v19 ppC02, ppC03
  112. //v20 C10, C11
  113. //v21 C12, C13
  114. //v22 ppC10, ppC11
  115. //v23 ppC12, ppC13
  116. //v24 C20, C21
  117. //v25 C22, C23
  118. //v26 ppC20, ppC21
  119. //v27 ppC22, ppC23
  120. //v28 C30, C31
  121. //v29 C32, C33
  122. //v30 ppC30, ppC31
  123. //v31 ppC32, ppC33
  124. /*******************************************************************************
  125. * Macro definitions
  126. *******************************************************************************/
  127. .macro INIT8x4
  128. fmov d16, xzr
  129. fmov d17, d16
  130. fmov d18, d17
  131. fmov d19, d16
  132. fmov d20, d17
  133. fmov d21, d16
  134. fmov d22, d17
  135. fmov d23, d16
  136. fmov d24, d17
  137. fmov d25, d16
  138. fmov d26, d17
  139. fmov d27, d16
  140. fmov d28, d17
  141. fmov d29, d16
  142. fmov d30, d17
  143. fmov d31, d16
  144. .endm
  145. .macro KERNEL8x4_I
  146. ldp d8, d9, [pB]
  147. add pB, pB, #16
  148. ldp d10, d11, [pB]
  149. add pB, pB, #16
  150. ldp q0, q1, [pA]
  151. add pA, pA, #32
  152. fmul v16.2d, v0.2d, v8.d[0]
  153. fmul v29.2d, v1.2d, v11.d[0]
  154. ldp q2, q3, [ppA]
  155. add ppA, ppA, #32
  156. fmul v20.2d, v0.2d, v9.d[0]
  157. fmul v25.2d, v1.2d, v10.d[0]
  158. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  159. fmul v18.2d, v2.2d, v8.d[0]
  160. fmul v31.2d, v3.2d, v11.d[0]
  161. prfm PLDL1KEEP, [ppA, #A_PRE_SIZE]
  162. fmul v22.2d, v2.2d, v9.d[0]
  163. fmul v27.2d, v3.2d, v10.d[0]
  164. ldp d12, d13, [pB]
  165. add pB, pB, #16
  166. fmul v24.2d, v0.2d, v10.d[0]
  167. fmul v21.2d, v1.2d, v9.d[0]
  168. ldp q4, q5, [pA] // for next round
  169. add pA, pA, #32
  170. fmul v26.2d, v2.2d, v10.d[0]
  171. fmul v23.2d, v3.2d, v9.d[0]
  172. ldp q6, q7, [ppA] // for next round
  173. add ppA, ppA, #32
  174. fmul v28.2d, v0.2d, v11.d[0]
  175. fmul v17.2d, v1.2d, v8.d[0]
  176. ldp d14, d15, [pB]
  177. add pB, pB, #16
  178. fmul v30.2d, v2.2d, v11.d[0]
  179. fmul v19.2d, v3.2d, v8.d[0]
  180. .endm
  181. .macro KERNEL8x4_M2
  182. fmla v16.2d, v4.2d, v12.d[0]
  183. fmla v29.2d, v5.2d, v15.d[0]
  184. ldp d8, d9, [pB]
  185. add pB, pB, #16
  186. fmla v18.2d, v6.2d, v12.d[0]
  187. fmla v31.2d, v7.2d, v15.d[0]
  188. ldp d10, d11, [pB]
  189. add pB, pB, #16
  190. fmla v20.2d, v4.2d, v13.d[0]
  191. fmla v25.2d, v5.2d, v14.d[0]
  192. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  193. fmla v22.2d, v6.2d, v13.d[0]
  194. fmla v27.2d, v7.2d, v14.d[0]
  195. fmla v24.2d, v4.2d, v14.d[0]
  196. fmla v21.2d, v5.2d, v13.d[0]
  197. ldp q0, q1, [pA]
  198. add pA, pA, #32
  199. fmla v26.2d, v6.2d, v14.d[0]
  200. fmla v23.2d, v7.2d, v13.d[0]
  201. fmla v28.2d, v4.2d, v15.d[0]
  202. fmla v17.2d, v5.2d, v12.d[0]
  203. ldp q2, q3, [ppA]
  204. add ppA, ppA, #32
  205. fmla v30.2d, v6.2d, v15.d[0]
  206. fmla v19.2d, v7.2d, v12.d[0]
  207. .endm
  208. .macro KERNEL8x4_M1
  209. fmla v16.2d, v0.2d, v8.d[0]
  210. fmla v29.2d, v1.2d, v11.d[0]
  211. ldp d12, d13, [pB]
  212. add pB, pB, #16
  213. fmla v18.2d, v2.2d, v8.d[0]
  214. fmla v31.2d, v3.2d, v11.d[0]
  215. ldp d14, d15, [pB]
  216. add pB, pB, #16
  217. fmla v20.2d, v0.2d, v9.d[0]
  218. fmla v25.2d, v1.2d, v10.d[0]
  219. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  220. fmla v22.2d, v2.2d, v9.d[0]
  221. fmla v27.2d, v3.2d, v10.d[0]
  222. prfm PLDL1KEEP, [ppA, #A_PRE_SIZE]
  223. fmla v24.2d, v0.2d, v10.d[0]
  224. fmla v21.2d, v1.2d, v9.d[0]
  225. ldp q4, q5, [pA]
  226. add pA, pA, #32
  227. fmla v26.2d, v2.2d, v10.d[0]
  228. fmla v23.2d, v3.2d, v9.d[0]
  229. fmla v28.2d, v0.2d, v11.d[0]
  230. fmla v17.2d, v1.2d, v8.d[0]
  231. ldp q6, q7, [ppA]
  232. add ppA, ppA, #32
  233. fmla v30.2d, v2.2d, v11.d[0]
  234. fmla v19.2d, v3.2d, v8.d[0]
  235. .endm
  236. .macro KERNEL8x4_E
  237. fmla v16.2d, v4.2d, v12.d[0]
  238. fmla v25.2d, v5.2d, v14.d[0]
  239. fmla v18.2d, v6.2d, v12.d[0]
  240. fmla v27.2d, v7.2d, v14.d[0]
  241. fmla v20.2d, v4.2d, v13.d[0]
  242. fmla v29.2d, v5.2d, v15.d[0]
  243. fmla v22.2d, v6.2d, v13.d[0]
  244. fmla v31.2d, v7.2d, v15.d[0]
  245. fmla v24.2d, v4.2d, v14.d[0]
  246. fmla v17.2d, v5.2d, v12.d[0]
  247. fmla v26.2d, v6.2d, v14.d[0]
  248. fmla v19.2d, v7.2d, v12.d[0]
  249. fmla v28.2d, v4.2d, v15.d[0]
  250. fmla v21.2d, v5.2d, v13.d[0]
  251. fmla v30.2d, v6.2d, v15.d[0]
  252. fmla v23.2d, v7.2d, v13.d[0]
  253. .endm
  254. .macro KERNEL8x4_SUB
  255. ldp d8, d9, [pB]
  256. add pB, pB, #16
  257. ldp d10, d11, [pB]
  258. add pB, pB, #16
  259. ldp q0, q1, [pA]
  260. add pA, pA, #32
  261. fmla v16.2d, v0.2d, v8.d[0]
  262. fmla v29.2d, v1.2d, v11.d[0]
  263. fmla v20.2d, v0.2d, v9.d[0]
  264. fmla v25.2d, v1.2d, v10.d[0]
  265. ldp q2, q3, [ppA]
  266. add ppA, ppA, #32
  267. fmla v24.2d, v0.2d, v10.d[0]
  268. fmla v21.2d, v1.2d, v9.d[0]
  269. fmla v28.2d, v0.2d, v11.d[0]
  270. fmla v17.2d, v1.2d, v8.d[0]
  271. fmla v18.2d, v2.2d, v8.d[0]
  272. fmla v31.2d, v3.2d, v11.d[0]
  273. fmla v22.2d, v2.2d, v9.d[0]
  274. fmla v27.2d, v3.2d, v10.d[0]
  275. fmla v26.2d, v2.2d, v10.d[0]
  276. fmla v23.2d, v3.2d, v9.d[0]
  277. fmla v30.2d, v2.2d, v11.d[0]
  278. fmla v19.2d, v3.2d, v8.d[0]
  279. .endm
  280. .macro SAVE8x4
  281. fmov alpha0, alpha
  282. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  283. add ppCRow0, pCRow0, #32
  284. ldp q0, q1, [pCRow0]
  285. fmla v0.2d, v16.2d, alphaV0
  286. fmla v1.2d, v17.2d, alphaV0
  287. stp q0, q1, [pCRow0]
  288. add pCRow0, pCRow0, #64
  289. ldp q2, q3, [ppCRow0]
  290. fmla v2.2d, v18.2d, alphaV0
  291. fmla v3.2d, v19.2d, alphaV0
  292. stp q2, q3, [ppCRow0]
  293. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  294. add ppCRow1, pCRow1, #32
  295. ldp q4, q5, [pCRow1]
  296. fmla v4.2d, v20.2d, alphaV0
  297. fmla v5.2d, v21.2d, alphaV0
  298. stp q4, q5, [pCRow1]
  299. add pCRow1, pCRow1, #64
  300. ldp q6, q7, [ppCRow1]
  301. fmla v6.2d, v22.2d, alphaV0
  302. fmla v7.2d, v23.2d, alphaV0
  303. stp q6, q7, [ppCRow1]
  304. prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
  305. add ppCRow2, pCRow2, #32
  306. ldp q0, q1, [pCRow2]
  307. fmla v0.2d, v24.2d, alphaV0
  308. fmla v1.2d, v25.2d, alphaV0
  309. stp q0, q1, [pCRow2]
  310. add pCRow2, pCRow2, #64
  311. ldp q2, q3, [ppCRow2]
  312. fmla v2.2d, v26.2d, alphaV0
  313. fmla v3.2d, v27.2d, alphaV0
  314. stp q2, q3, [ppCRow2]
  315. prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
  316. add ppCRow3, pCRow3, #32
  317. ldp q4, q5, [pCRow3]
  318. fmla v4.2d, v28.2d, alphaV0
  319. fmla v5.2d, v29.2d, alphaV0
  320. stp q4, q5, [pCRow3]
  321. add pCRow3, pCRow3, #64
  322. ldp q6, q7, [ppCRow3]
  323. fmla v6.2d, v30.2d, alphaV0
  324. fmla v7.2d, v31.2d, alphaV0
  325. stp q6, q7, [ppCRow3]
  326. .endm
  327. /******************************************************************************/
  328. .macro INIT4x4
  329. fmov d16, xzr
  330. fmov d17, d16
  331. fmov d20, d17
  332. fmov d21, d16
  333. fmov d24, d17
  334. fmov d25, d16
  335. fmov d28, d17
  336. fmov d29, d16
  337. .endm
  338. .macro KERNEL4x4_SUB
  339. ld1 {v8.2d, v9.2d}, [pB]
  340. add pB, pB, #32
  341. ld1 {v0.2d, v1.2d}, [pA]
  342. add pA, pA, #32
  343. fmla v16.2d, v0.2d, v8.d[0]
  344. fmla v29.2d, v1.2d, v9.d[1]
  345. fmla v20.2d, v0.2d, v8.d[1]
  346. fmla v25.2d, v1.2d, v9.d[0]
  347. fmla v24.2d, v0.2d, v9.d[0]
  348. fmla v21.2d, v1.2d, v8.d[1]
  349. fmla v28.2d, v0.2d, v9.d[1]
  350. fmla v17.2d, v1.2d, v8.d[0]
  351. .endm
  352. .macro SAVE4x4
  353. fmov alpha0, alpha
  354. ld1 {v8.2d, v9.2d}, [pCRow0]
  355. fmla v8.2d, v16.2d, alphaV0
  356. fmla v9.2d, v17.2d, alphaV0
  357. st1 {v8.2d, v9.2d}, [pCRow0]
  358. add pCRow1, pCRow0, LDC
  359. ld1 {v12.2d, v13.2d}, [pCRow1]
  360. fmla v12.2d, v20.2d, alphaV0
  361. fmla v13.2d, v21.2d, alphaV0
  362. st1 {v12.2d, v13.2d}, [pCRow1]
  363. add pCRow2, pCRow1, LDC
  364. ld1 {v8.2d, v9.2d}, [pCRow2]
  365. fmla v8.2d, v24.2d, alphaV0
  366. fmla v9.2d, v25.2d, alphaV0
  367. st1 {v8.2d, v9.2d}, [pCRow2]
  368. add pCRow1, pCRow2, LDC
  369. ld1 {v12.2d, v13.2d}, [pCRow1]
  370. fmla v12.2d, v28.2d, alphaV0
  371. fmla v13.2d, v29.2d, alphaV0
  372. st1 {v12.2d, v13.2d}, [pCRow1]
  373. add pCRow0, pCRow0, #32
  374. .endm
  375. /******************************************************************************/
  376. .macro INIT2x4
  377. fmov d16, xzr
  378. fmov d20, d16
  379. fmov d24, d20
  380. fmov d28, d16
  381. .endm
  382. .macro KERNEL2x4_SUB
  383. ld1 {v8.2d, v9.2d}, [pB]
  384. add pB, pB, #32
  385. ld1 {v0.2d}, [pA]
  386. add pA, pA, #16
  387. fmla v16.2d, v0.2d, v8.d[0]
  388. fmla v20.2d, v0.2d, v8.d[1]
  389. fmla v24.2d, v0.2d, v9.d[0]
  390. fmla v28.2d, v0.2d, v9.d[1]
  391. .endm
  392. .macro SAVE2x4
  393. fmov alpha0, alpha
  394. ld1 {v8.2d}, [pCRow0]
  395. fmla v8.2d, v16.2d, alphaV0
  396. st1 {v8.2d}, [pCRow0]
  397. add pCRow1, pCRow0, LDC
  398. ld1 {v12.2d}, [pCRow1]
  399. fmla v12.2d, v20.2d, alphaV0
  400. st1 {v12.2d}, [pCRow1]
  401. add pCRow2, pCRow1, LDC
  402. ld1 {v8.2d}, [pCRow2]
  403. fmla v8.2d, v24.2d, alphaV0
  404. st1 {v8.2d}, [pCRow2]
  405. add pCRow1, pCRow2, LDC
  406. ld1 {v12.2d}, [pCRow1]
  407. fmla v12.2d, v28.2d, alphaV0
  408. st1 {v12.2d}, [pCRow1]
  409. add pCRow0, pCRow0, #16
  410. .endm
  411. /******************************************************************************/
  412. .macro INIT1x4
  413. fmov d16, xzr
  414. fmov d20, d16
  415. .endm
  416. .macro KERNEL1x4_SUB
  417. ldr d0, [pA]
  418. add pA, pA, #8
  419. ld1 {v8.2d, v9.2d}, [pB]
  420. add pB, pB, #32
  421. fmla v16.2d, v8.2d, v0.d[0]
  422. fmla v20.2d, v9.2d, v0.d[0]
  423. .endm
  424. .macro SAVE1x4
  425. fmov alpha0, alpha
  426. add pCRow1, pCRow0, LDC
  427. ld1 {v8.d}[0], [pCRow0]
  428. ld1 {v8.d}[1], [pCRow1]
  429. fmla v8.2d, v16.2d, alphaV0
  430. st1 {v8.d}[0], [pCRow0]
  431. st1 {v8.d}[1], [pCRow1]
  432. add pCRow2, pCRow1, LDC
  433. add pCRow1, pCRow2, LDC
  434. ld1 {v12.d}[0], [pCRow2]
  435. ld1 {v12.d}[1], [pCRow1]
  436. fmla v12.2d, v20.2d, alphaV0
  437. st1 {v12.d}[0], [pCRow2]
  438. st1 {v12.d}[1], [pCRow1]
  439. add pCRow0, pCRow0, #8
  440. .endm
  441. /******************************************************************************/
  442. .macro INIT4x2
  443. fmov d16, xzr
  444. fmov d17, d16
  445. fmov d20, d17
  446. fmov d21, d16
  447. .endm
  448. .macro KERNEL4x2_SUB
  449. ld1 {v8.2d}, [pB]
  450. add pB, pB, #16
  451. ld1 {v0.2d, v1.2d}, [pA]
  452. add pA, pA, #32
  453. fmla v16.2d, v0.2d, v8.d[0]
  454. fmla v17.2d, v1.2d, v8.d[0]
  455. fmla v20.2d, v0.2d, v8.d[1]
  456. fmla v21.2d, v1.2d, v8.d[1]
  457. .endm
  458. .macro SAVE4x2
  459. fmov alpha0, alpha
  460. ld1 {v8.2d, v9.2d}, [pCRow0]
  461. fmla v8.2d, v16.2d, alphaV0
  462. fmla v9.2d, v17.2d, alphaV0
  463. st1 {v8.2d, v9.2d}, [pCRow0]
  464. add pCRow1, pCRow0, LDC
  465. ld1 {v12.2d, v13.2d}, [pCRow1]
  466. fmla v12.2d, v20.2d, alphaV0
  467. fmla v13.2d, v21.2d, alphaV0
  468. st1 {v12.2d, v13.2d}, [pCRow1]
  469. add pCRow0, pCRow0, #32
  470. .endm
  471. /******************************************************************************/
  472. .macro INIT2x2
  473. fmov d16, xzr
  474. fmov d20, d16
  475. .endm
  476. .macro KERNEL2x2_SUB
  477. ld1 {v8.2d}, [pB]
  478. add pB, pB, #16
  479. ld1 {v0.2d}, [pA]
  480. add pA, pA, #16
  481. fmla v16.2d, v0.2d, v8.d[0]
  482. fmla v20.2d, v0.2d, v8.d[1]
  483. .endm
  484. .macro SAVE2x2
  485. fmov alpha0, alpha
  486. ld1 {v8.2d}, [pCRow0]
  487. fmla v8.2d, v16.2d, alphaV0
  488. st1 {v8.2d}, [pCRow0]
  489. add pCRow1 , pCRow0, LDC
  490. ld1 {v12.2d}, [pCRow1]
  491. fmla v12.2d, v20.2d, alphaV0
  492. st1 {v12.2d}, [pCRow1]
  493. add pCRow0, pCRow0, #16
  494. .endm
  495. /******************************************************************************/
  496. .macro INIT1x2
  497. fmov d16, xzr
  498. .endm
  499. .macro KERNEL1x2_SUB
  500. ld1 {v8.2d} , [pB]
  501. add pB , pB, #16
  502. ldr d0 , [pA]
  503. add pA, pA, #8
  504. fmla v16.2d, v8.2d, v0.d[0]
  505. .endm
  506. .macro SAVE1x2
  507. fmov alpha0, alpha
  508. add pCRow1 , pCRow0, LDC
  509. ld1 {v8.d}[0], [pCRow0]
  510. ld1 {v8.d}[1], [pCRow1]
  511. fmla v8.2d, v16.2d, alphaV0
  512. st1 {v8.d}[0], [pCRow0]
  513. st1 {v8.d}[1], [pCRow1]
  514. add pCRow0, pCRow0, #8
  515. .endm
  516. /******************************************************************************/
  517. .macro INIT4x1
  518. fmov d16, xzr
  519. fmov d17, d16
  520. .endm
  521. .macro KERNEL4x1_SUB
  522. ldr d8, [pB]
  523. add pB , pB, #8
  524. ld1 {v0.2d, v1.2d}, [pA]
  525. add pA , pA, #32
  526. fmla v16.2d, v0.2d, v8.d[0]
  527. fmla v17.2d, v1.2d, v8.d[0]
  528. .endm
  529. .macro SAVE4x1
  530. fmov alpha0, alpha
  531. ld1 {v8.2d, v9.2d}, [pCRow0]
  532. fmla v8.2d, v16.2d, alphaV0
  533. fmla v9.2d, v17.2d, alphaV0
  534. st1 {v8.2d, v9.2d}, [pCRow0]
  535. add pCRow0, pCRow0, #32
  536. .endm
  537. /******************************************************************************/
  538. .macro INIT2x1
  539. fmov d16, xzr
  540. .endm
  541. .macro KERNEL2x1_SUB
  542. ldr d8, [pB]
  543. add pB , pB, #8
  544. ld1 {v0.2d}, [pA]
  545. add pA , pA, #16
  546. fmla v16.2d, v0.2d, v8.d[0]
  547. .endm
  548. .macro SAVE2x1
  549. fmov alpha0, alpha
  550. ld1 {v8.2d}, [pCRow0]
  551. fmla v8.2d, v16.2d, alphaV0
  552. st1 {v8.2d}, [pCRow0]
  553. add pCRow0, pCRow0, #16
  554. .endm
  555. /******************************************************************************/
  556. .macro INIT1x1
  557. fmov d16, xzr
  558. .endm
  559. .macro KERNEL1x1_SUB
  560. ldr d8, [pB]
  561. add pB , pB, #8
  562. ldr d0, [pA]
  563. add pA , pA, #8
  564. fmadd d16, d0, d8, d16
  565. .endm
  566. .macro SAVE1x1
  567. fmov alpha0, alpha
  568. ldr d8, [pCRow0]
  569. fmadd d8, d16, alpha0, d8
  570. str d8, [pCRow0]
  571. add pCRow0, pCRow0, #8
  572. .endm
  573. /*******************************************************************************
  574. * End of macro definitions
  575. *******************************************************************************/
  576. PROLOGUE
  577. .align 5
  578. add sp, sp, #-(11 * 16)
  579. stp d8, d9, [sp, #(0 * 16)]
  580. stp d10, d11, [sp, #(1 * 16)]
  581. stp d12, d13, [sp, #(2 * 16)]
  582. stp d14, d15, [sp, #(3 * 16)]
  583. stp d16, d17, [sp, #(4 * 16)]
  584. stp x18, x19, [sp, #(5 * 16)]
  585. stp x20, x21, [sp, #(6 * 16)]
  586. stp x22, x23, [sp, #(7 * 16)]
  587. stp x24, x25, [sp, #(8 * 16)]
  588. stp x26, x27, [sp, #(9 * 16)]
  589. str x28, [sp, #(10 * 16)]
  590. fmov alpha, d0
  591. prfm PLDL1KEEP, [origPA]
  592. prfm PLDL1KEEP, [origPB]
  593. lsl LDC, LDC, #3 // ldc = ldc * 8
  594. mov pB, origPB
  595. mov counterJ, origN
  596. asr counterJ, counterJ, #2 // J = J / 4
  597. cmp counterJ, #0
  598. ble .Ldgemm_kernel_L2_BEGIN
  599. .Ldgemm_kernel_L4_BEGIN:
  600. mov pCRow0, pC
  601. add pCRow1, pCRow0, LDC
  602. add pCRow2, pCRow1, LDC
  603. add pCRow3, pCRow2, LDC
  604. add pC, pCRow3, LDC
  605. lsl temp, origK, #5 // k * 4 * 8
  606. mov pA, origPA // pA = start of A array
  607. add ppA, temp, pA
  608. prfm PLDL1KEEP, [ppA]
  609. //------------------------------------------------------------------------------
  610. .Ldgemm_kernel_L4_M8_BEGIN:
  611. mov counterI, origM
  612. asr counterI, counterI, #3 // counterI = counterI / 8
  613. cmp counterI, #0
  614. ble .Ldgemm_kernel_L4_M4_BEGIN
  615. .align 5
  616. .Ldgemm_kernel_L4_M8_20:
  617. mov pB, origPB
  618. asr counterL , origK, #2 // L = K / 4
  619. cmp counterL , #2
  620. blt .Ldgemm_kernel_L4_M8_32
  621. KERNEL8x4_I
  622. KERNEL8x4_M2
  623. KERNEL8x4_M1
  624. KERNEL8x4_M2
  625. subs counterL, counterL, #2 // subtract 2
  626. ble .Ldgemm_kernel_L4_M8_22a
  627. .align 5
  628. .Ldgemm_kernel_L4_M8_22:
  629. KERNEL8x4_M1
  630. KERNEL8x4_M2
  631. KERNEL8x4_M1
  632. KERNEL8x4_M2
  633. subs counterL, counterL, #1
  634. bgt .Ldgemm_kernel_L4_M8_22
  635. .align 5
  636. .Ldgemm_kernel_L4_M8_22a:
  637. KERNEL8x4_M1
  638. KERNEL8x4_M2
  639. KERNEL8x4_M1
  640. KERNEL8x4_E
  641. b .Ldgemm_kernel_L4_M8_44
  642. .align 5
  643. .Ldgemm_kernel_L4_M8_32:
  644. tst counterL, #1
  645. ble .Ldgemm_kernel_L4_M8_40
  646. KERNEL8x4_I
  647. KERNEL8x4_M2
  648. KERNEL8x4_M1
  649. KERNEL8x4_E
  650. b .Ldgemm_kernel_L4_M8_44
  651. .Ldgemm_kernel_L4_M8_40:
  652. INIT8x4
  653. .Ldgemm_kernel_L4_M8_44:
  654. ands counterL , origK, #3
  655. ble .Ldgemm_kernel_L4_M8_100
  656. .align 5
  657. .Ldgemm_kernel_L4_M8_46:
  658. KERNEL8x4_SUB
  659. subs counterL, counterL, #1
  660. bne .Ldgemm_kernel_L4_M8_46
  661. .Ldgemm_kernel_L4_M8_100:
  662. lsl temp, origK, #5
  663. prfm PLDL1KEEP, [pA, temp]
  664. prfm PLDL1KEEP, [ppA, temp]
  665. prfm PLDL1KEEP, [origPB]
  666. SAVE8x4
  667. .Ldgemm_kernel_L4_M8_END:
  668. lsl temp, origK, #5 // k * 4 * 8
  669. add pA, pA, temp
  670. add ppA, ppA, temp
  671. subs counterI, counterI, #1
  672. bne .Ldgemm_kernel_L4_M8_20
  673. .Ldgemm_kernel_L4_M4_BEGIN:
  674. mov counterI, origM
  675. tst counterI , #7
  676. ble .Ldgemm_kernel_L4_END
  677. tst counterI, #4
  678. ble .Ldgemm_kernel_L4_M2_BEGIN
  679. .Ldgemm_kernel_L4_M4_20:
  680. INIT4x4
  681. mov pB, origPB
  682. asr counterL, origK, #3 // counterL = counterL / 8
  683. cmp counterL, #0
  684. ble .Ldgemm_kernel_L4_M4_40
  685. .Ldgemm_kernel_L4_M4_22:
  686. KERNEL4x4_SUB
  687. KERNEL4x4_SUB
  688. KERNEL4x4_SUB
  689. KERNEL4x4_SUB
  690. KERNEL4x4_SUB
  691. KERNEL4x4_SUB
  692. KERNEL4x4_SUB
  693. KERNEL4x4_SUB
  694. subs counterL, counterL, #1
  695. bgt .Ldgemm_kernel_L4_M4_22
  696. .Ldgemm_kernel_L4_M4_40:
  697. ands counterL , origK, #7 // counterL = counterL % 8
  698. ble .Ldgemm_kernel_L4_M4_100
  699. .Ldgemm_kernel_L4_M4_42:
  700. KERNEL4x4_SUB
  701. subs counterL, counterL, #1
  702. bgt .Ldgemm_kernel_L4_M4_42
  703. .Ldgemm_kernel_L4_M4_100:
  704. SAVE4x4
  705. .Ldgemm_kernel_L4_M4_END:
  706. .Ldgemm_kernel_L4_M2_BEGIN:
  707. mov counterI, origM
  708. tst counterI , #3
  709. ble .Ldgemm_kernel_L4_END
  710. tst counterI, #2 // counterI = counterI / 2
  711. ble .Ldgemm_kernel_L4_M1_BEGIN
  712. .Ldgemm_kernel_L4_M2_20:
  713. INIT2x4
  714. mov pB, origPB
  715. asr counterL , origK, #3 // counterL = counterL / 8
  716. cmp counterL , #0
  717. ble .Ldgemm_kernel_L4_M2_40
  718. .Ldgemm_kernel_L4_M2_22:
  719. KERNEL2x4_SUB
  720. KERNEL2x4_SUB
  721. KERNEL2x4_SUB
  722. KERNEL2x4_SUB
  723. KERNEL2x4_SUB
  724. KERNEL2x4_SUB
  725. KERNEL2x4_SUB
  726. KERNEL2x4_SUB
  727. subs counterL, counterL, #1
  728. bgt .Ldgemm_kernel_L4_M2_22
  729. .Ldgemm_kernel_L4_M2_40:
  730. ands counterL , origK, #7 // counterL = counterL % 8
  731. ble .Ldgemm_kernel_L4_M2_100
  732. .Ldgemm_kernel_L4_M2_42:
  733. KERNEL2x4_SUB
  734. subs counterL, counterL, #1
  735. bgt .Ldgemm_kernel_L4_M2_42
  736. .Ldgemm_kernel_L4_M2_100:
  737. SAVE2x4
  738. .Ldgemm_kernel_L4_M2_END:
  739. .Ldgemm_kernel_L4_M1_BEGIN:
  740. tst counterI, #1 // counterI = counterI % 2
  741. ble .Ldgemm_kernel_L4_END
  742. .Ldgemm_kernel_L4_M1_20:
  743. INIT1x4
  744. mov pB, origPB
  745. asr counterL , origK, #3 // counterL = counterL / 8
  746. cmp counterL , #0
  747. ble .Ldgemm_kernel_L4_M1_40
  748. .Ldgemm_kernel_L4_M1_22:
  749. KERNEL1x4_SUB
  750. KERNEL1x4_SUB
  751. KERNEL1x4_SUB
  752. KERNEL1x4_SUB
  753. KERNEL1x4_SUB
  754. KERNEL1x4_SUB
  755. KERNEL1x4_SUB
  756. KERNEL1x4_SUB
  757. subs counterL, counterL, #1
  758. bgt .Ldgemm_kernel_L4_M1_22
  759. .Ldgemm_kernel_L4_M1_40:
  760. ands counterL , origK, #7 // counterL = counterL % 8
  761. ble .Ldgemm_kernel_L4_M1_100
  762. .Ldgemm_kernel_L4_M1_42:
  763. KERNEL1x4_SUB
  764. subs counterL, counterL, #1
  765. bgt .Ldgemm_kernel_L4_M1_42
  766. .Ldgemm_kernel_L4_M1_100:
  767. SAVE1x4
  768. .Ldgemm_kernel_L4_END:
  769. lsl temp, origK, #5
  770. add origPB, origPB, temp // B = B + K * 4 * 8
  771. subs counterJ, counterJ , #1 // j--
  772. bgt .Ldgemm_kernel_L4_BEGIN
  773. /******************************************************************************/
  774. .Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction
  775. mov counterJ , origN
  776. tst counterJ , #3
  777. ble .Ldgemm_kernel_L999 // error, N was less than 4?
  778. tst counterJ , #2
  779. ble .Ldgemm_kernel_L1_BEGIN
  780. mov pCRow0, pC // pCRow0 = pC
  781. add pC,pC,LDC, lsl #1
  782. mov pA, origPA // pA = A
  783. .Ldgemm_kernel_L2_M4_BEGIN:
  784. mov counterI, origM
  785. asr counterI, counterI, #2 // counterI = counterI / 4
  786. cmp counterI,#0
  787. ble .Ldgemm_kernel_L2_M2_BEGIN
  788. .Ldgemm_kernel_L2_M4_20:
  789. INIT4x2
  790. mov pB, origPB
  791. asr counterL , origK, #3 // counterL = counterL / 8
  792. cmp counterL,#0
  793. ble .Ldgemm_kernel_L2_M4_40
  794. .align 5
  795. .Ldgemm_kernel_L2_M4_22:
  796. KERNEL4x2_SUB
  797. KERNEL4x2_SUB
  798. KERNEL4x2_SUB
  799. KERNEL4x2_SUB
  800. KERNEL4x2_SUB
  801. KERNEL4x2_SUB
  802. KERNEL4x2_SUB
  803. KERNEL4x2_SUB
  804. subs counterL, counterL, #1
  805. bgt .Ldgemm_kernel_L2_M4_22
  806. .Ldgemm_kernel_L2_M4_40:
  807. ands counterL , origK, #7 // counterL = counterL % 8
  808. ble .Ldgemm_kernel_L2_M4_100
  809. .Ldgemm_kernel_L2_M4_42:
  810. KERNEL4x2_SUB
  811. subs counterL, counterL, #1
  812. bgt .Ldgemm_kernel_L2_M4_42
  813. .Ldgemm_kernel_L2_M4_100:
  814. SAVE4x2
  815. .Ldgemm_kernel_L2_M4_END:
  816. subs counterI, counterI, #1
  817. bgt .Ldgemm_kernel_L2_M4_20
  818. .Ldgemm_kernel_L2_M2_BEGIN:
  819. mov counterI, origM
  820. tst counterI , #3
  821. ble .Ldgemm_kernel_L2_END
  822. tst counterI, #2 // counterI = counterI / 2
  823. ble .Ldgemm_kernel_L2_M1_BEGIN
  824. .Ldgemm_kernel_L2_M2_20:
  825. INIT2x2
  826. mov pB, origPB
  827. asr counterL , origK, #3 // counterL = counterL / 8
  828. cmp counterL,#0
  829. ble .Ldgemm_kernel_L2_M2_40
  830. .Ldgemm_kernel_L2_M2_22:
  831. KERNEL2x2_SUB
  832. KERNEL2x2_SUB
  833. KERNEL2x2_SUB
  834. KERNEL2x2_SUB
  835. KERNEL2x2_SUB
  836. KERNEL2x2_SUB
  837. KERNEL2x2_SUB
  838. KERNEL2x2_SUB
  839. subs counterL, counterL, #1
  840. bgt .Ldgemm_kernel_L2_M2_22
  841. .Ldgemm_kernel_L2_M2_40:
  842. ands counterL , origK, #7 // counterL = counterL % 8
  843. ble .Ldgemm_kernel_L2_M2_100
  844. .Ldgemm_kernel_L2_M2_42:
  845. KERNEL2x2_SUB
  846. subs counterL, counterL, #1
  847. bgt .Ldgemm_kernel_L2_M2_42
  848. .Ldgemm_kernel_L2_M2_100:
  849. SAVE2x2
  850. .Ldgemm_kernel_L2_M2_END:
  851. .Ldgemm_kernel_L2_M1_BEGIN:
  852. tst counterI, #1 // counterI = counterI % 2
  853. ble .Ldgemm_kernel_L2_END
  854. .Ldgemm_kernel_L2_M1_20:
  855. INIT1x2
  856. mov pB, origPB
  857. asr counterL , origK, #3 // counterL = counterL / 8
  858. cmp counterL, #0
  859. ble .Ldgemm_kernel_L2_M1_40
  860. .Ldgemm_kernel_L2_M1_22:
  861. KERNEL1x2_SUB
  862. KERNEL1x2_SUB
  863. KERNEL1x2_SUB
  864. KERNEL1x2_SUB
  865. KERNEL1x2_SUB
  866. KERNEL1x2_SUB
  867. KERNEL1x2_SUB
  868. KERNEL1x2_SUB
  869. subs counterL, counterL, #1
  870. bgt .Ldgemm_kernel_L2_M1_22
  871. .Ldgemm_kernel_L2_M1_40:
  872. ands counterL , origK, #7 // counterL = counterL % 8
  873. ble .Ldgemm_kernel_L2_M1_100
  874. .Ldgemm_kernel_L2_M1_42:
  875. KERNEL1x2_SUB
  876. subs counterL, counterL, #1
  877. bgt .Ldgemm_kernel_L2_M1_42
  878. .Ldgemm_kernel_L2_M1_100:
  879. SAVE1x2
  880. .Ldgemm_kernel_L2_END:
  881. add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
  882. /******************************************************************************/
  883. .Ldgemm_kernel_L1_BEGIN:
  884. mov counterJ , origN
  885. tst counterJ , #1
  886. ble .Ldgemm_kernel_L999 // done
  887. mov pCRow0, pC // pCRow0 = C
  888. add pC , pC , LDC // update pC to point to next
  889. mov pA, origPA // pA = A
  890. .Ldgemm_kernel_L1_M4_BEGIN:
  891. mov counterI, origM
  892. asr counterI, counterI, #2 // counterI = counterI / 4
  893. cmp counterI, #0
  894. ble .Ldgemm_kernel_L1_M2_BEGIN
  895. .Ldgemm_kernel_L1_M4_20:
  896. INIT4x1
  897. mov pB, origPB
  898. asr counterL , origK, #3 // counterL = counterL / 8
  899. cmp counterL , #0
  900. ble .Ldgemm_kernel_L1_M4_40
  901. .align 5
  902. .Ldgemm_kernel_L1_M4_22:
  903. KERNEL4x1_SUB
  904. KERNEL4x1_SUB
  905. KERNEL4x1_SUB
  906. KERNEL4x1_SUB
  907. KERNEL4x1_SUB
  908. KERNEL4x1_SUB
  909. KERNEL4x1_SUB
  910. KERNEL4x1_SUB
  911. subs counterL, counterL, #1
  912. bgt .Ldgemm_kernel_L1_M4_22
  913. .Ldgemm_kernel_L1_M4_40:
  914. ands counterL , origK, #7 // counterL = counterL % 8
  915. ble .Ldgemm_kernel_L1_M4_100
  916. .Ldgemm_kernel_L1_M4_42:
  917. KERNEL4x1_SUB
  918. subs counterL, counterL, #1
  919. bgt .Ldgemm_kernel_L1_M4_42
  920. .Ldgemm_kernel_L1_M4_100:
  921. SAVE4x1
  922. .Ldgemm_kernel_L1_M4_END:
  923. subs counterI, counterI, #1
  924. bgt .Ldgemm_kernel_L1_M4_20
  925. .Ldgemm_kernel_L1_M2_BEGIN:
  926. mov counterI, origM
  927. tst counterI , #3
  928. ble .Ldgemm_kernel_L1_END
  929. tst counterI, #2 // counterI = counterI / 2
  930. ble .Ldgemm_kernel_L1_M1_BEGIN
  931. .Ldgemm_kernel_L1_M2_20:
  932. INIT2x1
  933. mov pB, origPB
  934. asr counterL , origK, #3 // counterL = counterL / 8
  935. cmp counterL , #0
  936. ble .Ldgemm_kernel_L1_M2_40
  937. .Ldgemm_kernel_L1_M2_22:
  938. KERNEL2x1_SUB
  939. KERNEL2x1_SUB
  940. KERNEL2x1_SUB
  941. KERNEL2x1_SUB
  942. KERNEL2x1_SUB
  943. KERNEL2x1_SUB
  944. KERNEL2x1_SUB
  945. KERNEL2x1_SUB
  946. subs counterL, counterL, #1
  947. bgt .Ldgemm_kernel_L1_M2_22
  948. .Ldgemm_kernel_L1_M2_40:
  949. ands counterL , origK, #7 // counterL = counterL % 8
  950. ble .Ldgemm_kernel_L1_M2_100
  951. .Ldgemm_kernel_L1_M2_42:
  952. KERNEL2x1_SUB
  953. subs counterL, counterL, #1
  954. bgt .Ldgemm_kernel_L1_M2_42
  955. .Ldgemm_kernel_L1_M2_100:
  956. SAVE2x1
  957. .Ldgemm_kernel_L1_M2_END:
  958. .Ldgemm_kernel_L1_M1_BEGIN:
  959. tst counterI, #1 // counterI = counterI % 2
  960. ble .Ldgemm_kernel_L1_END
  961. .Ldgemm_kernel_L1_M1_20:
  962. INIT1x1
  963. mov pB, origPB
  964. asr counterL , origK, #3 // counterL = counterL / 8
  965. cmp counterL , #0
  966. ble .Ldgemm_kernel_L1_M1_40
  967. .Ldgemm_kernel_L1_M1_22:
  968. KERNEL1x1_SUB
  969. KERNEL1x1_SUB
  970. KERNEL1x1_SUB
  971. KERNEL1x1_SUB
  972. KERNEL1x1_SUB
  973. KERNEL1x1_SUB
  974. KERNEL1x1_SUB
  975. KERNEL1x1_SUB
  976. subs counterL, counterL, #1
  977. bgt .Ldgemm_kernel_L1_M1_22
  978. .Ldgemm_kernel_L1_M1_40:
  979. ands counterL , origK, #7 // counterL = counterL % 8
  980. ble .Ldgemm_kernel_L1_M1_100
  981. .Ldgemm_kernel_L1_M1_42:
  982. KERNEL1x1_SUB
  983. subs counterL, counterL, #1
  984. bgt .Ldgemm_kernel_L1_M1_42
  985. .Ldgemm_kernel_L1_M1_100:
  986. SAVE1x1
  987. .Ldgemm_kernel_L1_END:
  988. .Ldgemm_kernel_L999:
  989. mov x0, #0 // set return value
  990. ldp d8, d9, [sp, #(0 * 16)]
  991. ldp d10, d11, [sp, #(1 * 16)]
  992. ldp d12, d13, [sp, #(2 * 16)]
  993. ldp d14, d15, [sp, #(3 * 16)]
  994. ldp d16, d17, [sp, #(4 * 16)]
  995. ldp x18, x19, [sp, #(5 * 16)]
  996. ldp x20, x21, [sp, #(6 * 16)]
  997. ldp x22, x23, [sp, #(7 * 16)]
  998. ldp x24, x25, [sp, #(8 * 16)]
  999. ldp x26, x27, [sp, #(9 * 16)]
  1000. ldr x28, [sp, #(10 * 16)]
  1001. add sp, sp, #(11*16)
  1002. ret
  1003. EPILOGUE