You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

sgemm_kernel_4x4.S 30 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563
  1. /*******************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. /* X0 X1 X2 s0 X3 x4 x5 x6 */
  30. /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) */
  /* Symbolic names for the registers used by this kernel.  Comments are kept
     off the define lines so that nothing but the register name is expanded. */

  /* Kernel arguments, in the registers they arrive in (alpha arrives in s0). */
  #define origM           x0
  #define origN           x1
  #define origK           x2
  #define origPA          x3
  #define origPB          x4
  #define pC              x5
  #define LDC             x6

  /* Scratch value and loop counters. */
  #define temp            x7
  #define counterL        x8
  #define counterI        x9
  #define counterJ        x10

  /* Working pointers into B and C. */
  #define pB              x11
  #define pCRow0          x12
  #define pCRow1          x13
  #define pCRow2          x14

  /* Four A pointers; the 16x4 kernels read from all four.
     NOTE(review): x18 is the AAPCS64 platform register - confirm it is free
     on every target this file is built for. */
  #define pA_0            x15
  #define pA_1            x16
  #define pA_2            x17
  #define pA_3            x18

  /* alpha, once per register: scalar name and lane 0 of the same vector. */
  #define alpha0          s10
  #define alphaV0         v10.s[0]
  #define alpha1          s11
  #define alphaV1         v11.s[0]
  #define alpha2          s14
  #define alphaV2         v14.s[0]
  #define alpha3          s15
  #define alphaV3         v15.s[0]
  58. // 00 origM
  59. // 01 origN
  60. // 02 origK
  61. // 03 origPA
  62. // 04 origPB
  63. // 05 pC
  64. // 06 origLDC -> LDC
  65. // 07 offset -> temp
  66. // 08 counterL
  67. // 09 counterI
  68. // 10 counterJ
  69. // 11 pB
  70. // 12 pCRow0
  71. // 13 pCRow1
  72. // 14 pCRow2
  73. // 15 pA_0
  74. // 16 pA_1
  75. // 17 pA_2
  76. // 18 must save pA_3
  77. // 19 must save
  78. // 20 must save
  79. // 21 must save
  80. // 22 must save
  81. // 23 must save
  82. // 24 must save
  83. // 25 must save
  84. // 26 must save
  85. // 27 must save
  86. // 28 must save
  87. // 29 frame
  88. // 30 link
  89. // 31 sp
  90. /***************************** FOR 16x4 ***************************************/
  91. //v00 ALPHA -> pA00_0, pA01_0, pA02_0, pA03_0
  92. //v01 pA10_0, pA11_0, pA12_0, pA13_0
  93. //v02 pA00_1, pA01_1, pA02_1, pA03_1
  94. //v03 pA10_1, pA11_1, pA12_1, pA13_1
  95. //v04 pA00_2, pA01_2, pA02_2, pA03_2
  96. //v05 pA10_2, pA11_2, pA12_2, pA13_2
  97. //v06 pA00_3, pA01_3, pA02_3, pA03_3
  98. //v07 pA10_3, pA11_3, pA12_3, pA13_3
  99. //v08 must save pB00, pB01, pB02, pB03
  100. //v09 must save
  101. //v10 must save ALPHA0
  102. //v11 must save ALPHA1
  103. //v12 must save pB10, pB11, pB12, pB13
  104. //v13 must save
  105. //v14 must save ALPHA2
  106. //v15 must save ALPHA3
  107. //v16 must save C00_0, C01_0, C02_0, C03_0
  108. //v17 must save C10_0, C11_0, C12_0, C13_0
  109. //v18 C20_0, C21_0, C22_0, C23_0
  110. //v19 C30_0, C31_0, C32_0, C33_0
  111. //v20 C00_1, C01_1, C02_1, C03_1
  112. //v21 C10_1, C11_1, C12_1, C13_1
  113. //v22 C20_1, C21_1, C22_1, C23_1
  114. //v23 C30_1, C31_1, C32_1, C33_1
  115. //v24 C00_2, C01_2, C02_2, C03_2
  116. //v25 C10_2, C11_2, C12_2, C13_2
  117. //v26 C20_2, C21_2, C22_2, C23_2
  118. //v27 C30_2, C31_2, C32_2, C33_2
  119. //v28 C00_3, C01_3, C02_3, C03_3
  120. //v29 C10_3, C11_3, C12_3, C13_3
  121. //v30 C20_3, C21_3, C22_3, C23_3
  122. //v31 C30_3, C31_3, C32_3, C33_3
  123. /***************************** EXCEPT FOR 16x4 ********************************/
  124. //v00 ALPHA -> pA00, pA01
  125. //v01 pA02, pA03
  126. //v02 ppA00, ppA01
  127. //v03 ppA02, ppA03
  128. //v04 pA10, pA11
  129. //v05 pA12, pA13
  130. //v06 ppA10, ppA11
  131. //v07 ppA12, ppA13
  132. //v08 must save pB00, pB01
  133. //v09 must save pB02, pB03
  134. //v10 must save ALPHA0
  135. //v11 must save ALPHA1
  136. //v12 must save pB10, pB11
  137. //v13 must save pB12, pB13
  138. //v14 must save ALPHA2
  139. //v15 must save ALPHA3
  140. //v16 must save C00, C01
  141. //v17 must save C02, C03
  142. //v18 ppC00, ppC01
  143. //v19 ppC02, ppC03
  144. //v20 C10, C11
  145. //v21 C12, C13
  146. //v22 ppC10, ppC11
  147. //v23 ppC12, ppC13
  148. //v24 C20, C21
  149. //v25 C22, C23
  150. //v26 ppC20, ppC21
  151. //v27 ppC22, ppC23
  152. //v28 C30, C31
  153. //v29 C32, C33
  154. //v30 ppC30, ppC31
  155. //v31 ppC32, ppC33
  156. /*******************************************************************************
  157. * Macro definitions
  158. *******************************************************************************/
  // INIT16x4: clear the sixteen accumulators v16-v31.
  // Each one is written through its s view straight from wzr, so no
  // accumulator depends on another one having been cleared first.
  .macro INIT16x4
        fmov    s16, wzr
        fmov    s17, wzr
        fmov    s18, wzr
        fmov    s19, wzr
        fmov    s20, wzr
        fmov    s21, wzr
        fmov    s22, wzr
        fmov    s23, wzr
        fmov    s24, wzr
        fmov    s25, wzr
        fmov    s26, wzr
        fmov    s27, wzr
        fmov    s28, wzr
        fmov    s29, wzr
        fmov    s30, wzr
        fmov    s31, wzr
  .endm
  // KERNEL16x4_I: opening K step of the pipelined 16x4 sequence.
  // Uses fmul rather than fmla, so v16-v31 are overwritten and do not have
  // to be cleared first.  Consumes v8 (B) with v0/v2/v4/v6 (A) and preloads
  // v12 and v1/v3/v5/v7 for the following step.
  // Advances pB and each of pA_0..pA_3 by 32 bytes.
  .macro KERNEL16x4_I
        ld1     {v8.4s}, [pB], #16
        ld1     {v0.4s}, [pA_0], #16
        fmul    v16.4s, v0.4s, v8.s[0]
        fmul    v20.4s, v0.4s, v8.s[1]
        ld1     {v2.4s}, [pA_1], #16
        fmul    v24.4s, v0.4s, v8.s[2]
        fmul    v28.4s, v0.4s, v8.s[3]
        ld1     {v4.4s}, [pA_2], #16
        fmul    v17.4s, v2.4s, v8.s[0]
        fmul    v21.4s, v2.4s, v8.s[1]
        ld1     {v6.4s}, [pA_3], #16
        fmul    v25.4s, v2.4s, v8.s[2]
        fmul    v29.4s, v2.4s, v8.s[3]

        // From here on every load fetches an operand of the next step.
        ld1     {v12.4s}, [pB], #16
        fmul    v18.4s, v4.4s, v8.s[0]
        fmul    v19.4s, v6.4s, v8.s[0]
        ld1     {v1.4s}, [pA_0], #16
        fmul    v22.4s, v4.4s, v8.s[1]
        fmul    v23.4s, v6.4s, v8.s[1]
        ld1     {v3.4s}, [pA_1], #16
        fmul    v26.4s, v4.4s, v8.s[2]
        fmul    v27.4s, v6.4s, v8.s[2]
        ld1     {v5.4s}, [pA_2], #16
        fmul    v30.4s, v4.4s, v8.s[3]
        fmul    v31.4s, v6.4s, v8.s[3]
        ld1     {v7.4s}, [pA_3], #16
  .endm
  // KERNEL16x4_M2: pipelined K step that consumes the odd operand set
  // (v12 with v1/v3/v5/v7) while loading the even set (v8 with v0/v2/v4/v6)
  // for the step that follows.
  // Advances pB and pA_0..pA_3 by 16 bytes each, then issues prefetches
  // 512 bytes ahead of the updated pA_2, pA_3 and pB.
  .macro KERNEL16x4_M2
        fmla    v16.4s, v1.4s, v12.s[0]
        fmla    v17.4s, v3.4s, v12.s[0]
        ld1     {v8.4s}, [pB], #16              // next step: B
        fmla    v18.4s, v5.4s, v12.s[0]
        fmla    v19.4s, v7.4s, v12.s[0]
        ld1     {v0.4s}, [pA_0], #16            // next step: A
        fmla    v20.4s, v1.4s, v12.s[1]
        fmla    v21.4s, v3.4s, v12.s[1]
        ld1     {v2.4s}, [pA_1], #16
        fmla    v22.4s, v5.4s, v12.s[1]
        fmla    v23.4s, v7.4s, v12.s[1]
        ld1     {v4.4s}, [pA_2], #16
        fmla    v24.4s, v1.4s, v12.s[2]
        fmla    v25.4s, v3.4s, v12.s[2]
        ld1     {v6.4s}, [pA_3], #16
        fmla    v26.4s, v5.4s, v12.s[2]
        fmla    v27.4s, v7.4s, v12.s[2]
        prfm    PLDL1KEEP, [pA_2, #512]
        fmla    v28.4s, v1.4s, v12.s[3]
        fmla    v29.4s, v3.4s, v12.s[3]
        prfm    PLDL1KEEP, [pA_3, #512]
        fmla    v30.4s, v5.4s, v12.s[3]
        fmla    v31.4s, v7.4s, v12.s[3]
        prfm    PLDL1KEEP, [pB, #512]
  .endm
  // KERNEL16x4_M1: pipelined K step that consumes the even operand set
  // (v8 with v0/v2/v4/v6) while loading the odd set (v12 with v1/v3/v5/v7)
  // for the step that follows.
  // Advances pB and pA_0..pA_3 by 16 bytes each, then issues prefetches
  // 512 bytes ahead of the updated pA_0 and pA_1.
  .macro KERNEL16x4_M1
        fmla    v16.4s, v0.4s, v8.s[0]
        fmla    v17.4s, v2.4s, v8.s[0]
        ld1     {v12.4s}, [pB], #16             // next step: B
        fmla    v18.4s, v4.4s, v8.s[0]
        fmla    v19.4s, v6.4s, v8.s[0]
        ld1     {v1.4s}, [pA_0], #16            // next step: A
        fmla    v20.4s, v0.4s, v8.s[1]
        fmla    v21.4s, v2.4s, v8.s[1]
        ld1     {v3.4s}, [pA_1], #16
        fmla    v22.4s, v4.4s, v8.s[1]
        fmla    v23.4s, v6.4s, v8.s[1]
        ld1     {v5.4s}, [pA_2], #16
        fmla    v24.4s, v0.4s, v8.s[2]
        fmla    v25.4s, v2.4s, v8.s[2]
        ld1     {v7.4s}, [pA_3], #16
        fmla    v26.4s, v4.4s, v8.s[2]
        fmla    v27.4s, v6.4s, v8.s[2]
        prfm    PLDL1KEEP, [pA_0, #512]
        fmla    v28.4s, v0.4s, v8.s[3]
        fmla    v29.4s, v2.4s, v8.s[3]
        prfm    PLDL1KEEP, [pA_1, #512]
        fmla    v30.4s, v4.4s, v8.s[3]
        fmla    v31.4s, v6.4s, v8.s[3]
  .endm
  // KERNEL16x4_E: closing K step of the pipelined 16x4 sequence.
  // Consumes the already loaded odd operand set (v12 with v1/v3/v5/v7) and
  // performs no loads, so no pointer is advanced.
  // Every accumulator is updated exactly once; the updates are grouped by
  // A vector here.
  .macro KERNEL16x4_E
        // v1 -> v16, v20, v24, v28
        fmla    v16.4s, v1.4s, v12.s[0]
        fmla    v20.4s, v1.4s, v12.s[1]
        fmla    v24.4s, v1.4s, v12.s[2]
        fmla    v28.4s, v1.4s, v12.s[3]

        // v3 -> v17, v21, v25, v29
        fmla    v17.4s, v3.4s, v12.s[0]
        fmla    v21.4s, v3.4s, v12.s[1]
        fmla    v25.4s, v3.4s, v12.s[2]
        fmla    v29.4s, v3.4s, v12.s[3]

        // v5 -> v18, v22, v26, v30
        fmla    v18.4s, v5.4s, v12.s[0]
        fmla    v22.4s, v5.4s, v12.s[1]
        fmla    v26.4s, v5.4s, v12.s[2]
        fmla    v30.4s, v5.4s, v12.s[3]

        // v7 -> v19, v23, v27, v31
        fmla    v19.4s, v7.4s, v12.s[0]
        fmla    v23.4s, v7.4s, v12.s[1]
        fmla    v27.4s, v7.4s, v12.s[2]
        fmla    v31.4s, v7.4s, v12.s[3]
  .endm
  // KERNEL16x4_SUB: one self-contained (non-pipelined) K step for 16x4.
  // Loads v8 from pB and v0/v2/v4/v6 from pA_0..pA_3, accumulates into
  // v16-v31, and advances all five pointers by 16 bytes.
  .macro KERNEL16x4_SUB
        ld1     {v8.4s}, [pB], #16

        ld1     {v0.4s}, [pA_0], #16
        fmla    v16.4s, v0.4s, v8.s[0]
        fmla    v20.4s, v0.4s, v8.s[1]
        fmla    v24.4s, v0.4s, v8.s[2]
        fmla    v28.4s, v0.4s, v8.s[3]

        ld1     {v2.4s}, [pA_1], #16
        fmla    v17.4s, v2.4s, v8.s[0]
        fmla    v21.4s, v2.4s, v8.s[1]
        fmla    v25.4s, v2.4s, v8.s[2]
        fmla    v29.4s, v2.4s, v8.s[3]

        ld1     {v4.4s}, [pA_2], #16
        fmla    v18.4s, v4.4s, v8.s[0]
        fmla    v22.4s, v4.4s, v8.s[1]
        fmla    v26.4s, v4.4s, v8.s[2]
        fmla    v30.4s, v4.4s, v8.s[3]

        ld1     {v6.4s}, [pA_3], #16
        fmla    v19.4s, v6.4s, v8.s[0]
        fmla    v23.4s, v6.4s, v8.s[1]
        fmla    v27.4s, v6.4s, v8.s[2]
        fmla    v31.4s, v6.4s, v8.s[3]
  .endm
  // SAVE16x4: C += alpha * accumulators for a 16x4 tile.
  // Four lines of 64 bytes are updated, LDC bytes apart, starting at pCRow0.
  // The store of each of the first three lines also steps pCRow1 by LDC.
  // On exit pCRow0 has advanced by 64 and pCRow1 addresses the fourth line.
  // Clobbers v0-v7 and pCRow1.
  .macro SAVE16x4
        mov     pCRow1, pCRow0

        ld1     {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow1]
        fmla    v0.4s, v16.4s, alphaV0
        fmla    v1.4s, v17.4s, alphaV1
        fmla    v2.4s, v18.4s, alphaV2
        fmla    v3.4s, v19.4s, alphaV3
        st1     {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow1], LDC

        ld1     {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
        fmla    v4.4s, v20.4s, alphaV0
        fmla    v5.4s, v21.4s, alphaV1
        fmla    v6.4s, v22.4s, alphaV2
        fmla    v7.4s, v23.4s, alphaV3
        st1     {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1], LDC

        ld1     {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow1]
        fmla    v0.4s, v24.4s, alphaV0
        fmla    v1.4s, v25.4s, alphaV1
        fmla    v2.4s, v26.4s, alphaV2
        fmla    v3.4s, v27.4s, alphaV3
        st1     {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow1], LDC

        ld1     {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
        fmla    v4.4s, v28.4s, alphaV0
        fmla    v5.4s, v29.4s, alphaV1
        fmla    v6.4s, v30.4s, alphaV2
        fmla    v7.4s, v31.4s, alphaV3
        st1     {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]

        add     pCRow0, pCRow0, #64
  .endm
  353. /******************************************************************************/
  // INIT8x4: clear the sixteen accumulators v16-v31 used by the 8x4 kernel.
  // Each one is written through its s view straight from wzr.
  .macro INIT8x4
        fmov    s16, wzr
        fmov    s17, wzr
        fmov    s18, wzr
        fmov    s19, wzr
        fmov    s20, wzr
        fmov    s21, wzr
        fmov    s22, wzr
        fmov    s23, wzr
        fmov    s24, wzr
        fmov    s25, wzr
        fmov    s26, wzr
        fmov    s27, wzr
        fmov    s28, wzr
        fmov    s29, wzr
        fmov    s30, wzr
        fmov    s31, wzr
  .endm
  // KERNEL8x4_SUB: one K step for an 8x4 tile.
  // Loads four B values into v8/v9, four A values from pA_0 into v0/v1 and
  // four from pA_1 into v2/v3, then updates each of v16-v31 exactly once.
  // pB, pA_0 and pA_1 advance by 16 bytes.
  // The updates are independent and are listed in accumulator order.
  .macro KERNEL8x4_SUB
        ld1     {v8.2s, v9.2s}, [pB], #16
        ld1     {v0.2s, v1.2s}, [pA_0], #16
        ld1     {v2.2s, v3.2s}, [pA_1], #16

        // B lane v8.s[0]
        fmla    v16.2s, v0.2s, v8.s[0]
        fmla    v17.2s, v1.2s, v8.s[0]
        fmla    v18.2s, v2.2s, v8.s[0]
        fmla    v19.2s, v3.2s, v8.s[0]

        // B lane v8.s[1]
        fmla    v20.2s, v0.2s, v8.s[1]
        fmla    v21.2s, v1.2s, v8.s[1]
        fmla    v22.2s, v2.2s, v8.s[1]
        fmla    v23.2s, v3.2s, v8.s[1]

        // B lane v9.s[0]
        fmla    v24.2s, v0.2s, v9.s[0]
        fmla    v25.2s, v1.2s, v9.s[0]
        fmla    v26.2s, v2.2s, v9.s[0]
        fmla    v27.2s, v3.2s, v9.s[0]

        // B lane v9.s[1]
        fmla    v28.2s, v0.2s, v9.s[1]
        fmla    v29.2s, v1.2s, v9.s[1]
        fmla    v30.2s, v2.2s, v9.s[1]
        fmla    v31.2s, v3.2s, v9.s[1]
  .endm
  // SAVE8x4: C += alpha * accumulators for an 8x4 tile.
  // Four lines, LDC bytes apart, are updated starting at pCRow0; each line
  // is handled as two 16-byte halves.  pCRow1 and pCRow2 take turns: while
  // one walks the two halves of the current line, the other is set to the
  // start of the next line.
  // On exit pCRow0 has advanced by 32.  Clobbers v0-v7, pCRow1 and pCRow2.
  .macro SAVE8x4
        mov     pCRow1, pCRow0                  // line 0
        add     pCRow2, pCRow0, LDC             // line 1

        // line 0, first half then second half
        ld1     {v0.2s, v1.2s}, [pCRow1]
        fmla    v0.2s, v16.2s, alphaV0
        fmla    v1.2s, v17.2s, alphaV1
        st1     {v0.2s, v1.2s}, [pCRow1], #16
        ld1     {v2.2s, v3.2s}, [pCRow1]
        fmla    v2.2s, v18.2s, alphaV2
        fmla    v3.2s, v19.2s, alphaV3
        st1     {v2.2s, v3.2s}, [pCRow1]

        // line 1; the line 2 address is taken before pCRow2 moves
        ld1     {v4.2s, v5.2s}, [pCRow2]
        fmla    v4.2s, v20.2s, alphaV0
        fmla    v5.2s, v21.2s, alphaV1
        add     pCRow1, pCRow2, LDC
        st1     {v4.2s, v5.2s}, [pCRow2], #16
        ld1     {v6.2s, v7.2s}, [pCRow2]
        fmla    v6.2s, v22.2s, alphaV2
        fmla    v7.2s, v23.2s, alphaV3
        st1     {v6.2s, v7.2s}, [pCRow2]

        // line 2; the line 3 address is taken before pCRow1 moves
        ld1     {v0.2s, v1.2s}, [pCRow1]
        fmla    v0.2s, v24.2s, alphaV0
        fmla    v1.2s, v25.2s, alphaV1
        add     pCRow2, pCRow1, LDC
        st1     {v0.2s, v1.2s}, [pCRow1], #16
        ld1     {v2.2s, v3.2s}, [pCRow1]
        fmla    v2.2s, v26.2s, alphaV2
        fmla    v3.2s, v27.2s, alphaV3
        st1     {v2.2s, v3.2s}, [pCRow1]

        // line 3
        ld1     {v4.2s, v5.2s}, [pCRow2]
        fmla    v4.2s, v28.2s, alphaV0
        fmla    v5.2s, v29.2s, alphaV1
        st1     {v4.2s, v5.2s}, [pCRow2], #16
        ld1     {v6.2s, v7.2s}, [pCRow2]
        fmla    v6.2s, v30.2s, alphaV2
        fmla    v7.2s, v31.2s, alphaV3
        st1     {v6.2s, v7.2s}, [pCRow2]

        add     pCRow0, pCRow0, #32
  .endm
  439. /******************************************************************************/
  // INIT4x4: clear the eight accumulators used by the 4x4 kernel
  // (v16, v17, v20, v21, v24, v25, v28, v29), each straight from wzr.
  .macro INIT4x4
        fmov    s16, wzr
        fmov    s17, wzr
        fmov    s20, wzr
        fmov    s21, wzr
        fmov    s24, wzr
        fmov    s25, wzr
        fmov    s28, wzr
        fmov    s29, wzr
  .endm
  // KERNEL4x4_SUB: one K step for a 4x4 tile.
  // Loads four B values into v8/v9 and four A values into v0/v1, then
  // updates each of the eight accumulators exactly once.
  // pB and pA_0 advance by 16 bytes.
  // The updates are independent and are listed in accumulator order.
  .macro KERNEL4x4_SUB
        ld1     {v8.2s, v9.2s}, [pB], #16
        ld1     {v0.2s, v1.2s}, [pA_0], #16

        fmla    v16.2s, v0.2s, v8.s[0]
        fmla    v17.2s, v1.2s, v8.s[0]

        fmla    v20.2s, v0.2s, v8.s[1]
        fmla    v21.2s, v1.2s, v8.s[1]

        fmla    v24.2s, v0.2s, v9.s[0]
        fmla    v25.2s, v1.2s, v9.s[0]

        fmla    v28.2s, v0.2s, v9.s[1]
        fmla    v29.2s, v1.2s, v9.s[1]
  .endm
  // SAVE4x4: C += alpha * accumulators for a 4x4 tile.
  // Four 16-byte lines, LDC bytes apart, are updated starting at pCRow0.
  // The store of line 0 also advances pCRow0 by 16, so the address of
  // line 1 is taken first.
  // Clobbers v8, v9, v12, v13, pCRow1 and pCRow2.
  .macro SAVE4x4
        add     pCRow1, pCRow0, LDC             // line 1

        ld1     {v8.2s, v9.2s}, [pCRow0]
        fmla    v8.2s, v16.2s, alphaV0
        fmla    v9.2s, v17.2s, alphaV1
        st1     {v8.2s, v9.2s}, [pCRow0], #16

        add     pCRow2, pCRow1, LDC             // line 2
        ld1     {v12.2s, v13.2s}, [pCRow1]
        fmla    v12.2s, v20.2s, alphaV2
        fmla    v13.2s, v21.2s, alphaV3
        st1     {v12.2s, v13.2s}, [pCRow1]

        add     pCRow1, pCRow2, LDC             // line 3
        ld1     {v8.2s, v9.2s}, [pCRow2]
        fmla    v8.2s, v24.2s, alphaV0
        fmla    v9.2s, v25.2s, alphaV1
        st1     {v8.2s, v9.2s}, [pCRow2]

        ld1     {v12.2s, v13.2s}, [pCRow1]
        fmla    v12.2s, v28.2s, alphaV2
        fmla    v13.2s, v29.2s, alphaV3
        st1     {v12.2s, v13.2s}, [pCRow1]
  .endm
  486. /******************************************************************************/
  // INIT2x4: clear the four accumulators used by the 2x4 kernel
  // (v16, v20, v24, v28), each straight from wzr.
  .macro INIT2x4
        fmov    s16, wzr
        fmov    s20, wzr
        fmov    s24, wzr
        fmov    s28, wzr
  .endm
  // KERNEL2x4_SUB: one K step for a 2x4 tile.
  // Loads four B values into v8/v9 and two A values into v0, then
  // accumulates v0 times each B lane into v16, v20, v24 and v28.
  // pB advances by 16 bytes and pA_0 by 8.
  .macro KERNEL2x4_SUB
        ld1     {v8.2s, v9.2s}, [pB], #16
        ld1     {v0.2s}, [pA_0], #8

        fmla    v16.2s, v0.2s, v8.s[0]
        fmla    v20.2s, v0.2s, v8.s[1]
        fmla    v24.2s, v0.2s, v9.s[0]
        fmla    v28.2s, v0.2s, v9.s[1]
  .endm
  // SAVE2x4: C += alpha * accumulators for a 2x4 tile.
  // Four 8-byte lines, LDC bytes apart, are updated starting at pCRow0.
  // The store of line 0 also advances pCRow0 by 8, so the address of
  // line 1 is taken first.
  // Clobbers v8, v12, pCRow1 and pCRow2.
  .macro SAVE2x4
        add     pCRow1, pCRow0, LDC             // line 1

        ld1     {v8.2s}, [pCRow0]
        fmla    v8.2s, v16.2s, alphaV0
        st1     {v8.2s}, [pCRow0], #8

        add     pCRow2, pCRow1, LDC             // line 2
        ld1     {v12.2s}, [pCRow1]
        fmla    v12.2s, v20.2s, alphaV1
        st1     {v12.2s}, [pCRow1]

        add     pCRow1, pCRow2, LDC             // line 3
        ld1     {v8.2s}, [pCRow2]
        fmla    v8.2s, v24.2s, alphaV2
        st1     {v8.2s}, [pCRow2]

        ld1     {v12.2s}, [pCRow1]
        fmla    v12.2s, v28.2s, alphaV3
        st1     {v12.2s}, [pCRow1]
  .endm
  521. /******************************************************************************/
  // INIT1x4: clear the two accumulators used by the 1x4 kernel (v16, v20),
  // each straight from wzr.
  .macro INIT1x4
        fmov    s16, wzr
        fmov    s20, wzr
  .endm
  // KERNEL1x4_SUB: one K step for a 1x4 tile.
  // Loads one A value into s0 and four B values into v8/v9; v16 gathers the
  // products with the first B pair and v20 those with the second pair.
  // pA_0 advances by 4 bytes and pB by 16.
  .macro KERNEL1x4_SUB
        ldr     s0, [pA_0], #4
        ld1     {v8.2s, v9.2s}, [pB], #16

        fmla    v16.2s, v8.2s, v0.s[0]
        fmla    v20.2s, v9.2s, v0.s[0]
  .endm
  // SAVE1x4: C += alpha * accumulators for a 1x4 tile.
  // One float on each of four lines, LDC bytes apart, is updated.  Lines 0
  // and 1 are gathered into the two lanes of v8, lines 2 and 3 into v12.
  // The lane 0 store of the first pair also advances pCRow0 by 4.
  // Clobbers v8, v12, pCRow1 and pCRow2.
  .macro SAVE1x4
        add     pCRow1, pCRow0, LDC             // line 1

        ld1     {v8.s}[0], [pCRow0]
        ld1     {v8.s}[1], [pCRow1]
        fmla    v8.2s, v16.2s, alphaV0
        st1     {v8.s}[0], [pCRow0], #4
        st1     {v8.s}[1], [pCRow1]

        add     pCRow2, pCRow1, LDC             // line 2
        add     pCRow1, pCRow2, LDC             // line 3

        ld1     {v12.s}[0], [pCRow2]
        ld1     {v12.s}[1], [pCRow1]
        fmla    v12.2s, v20.2s, alphaV1
        st1     {v12.s}[0], [pCRow2]
        st1     {v12.s}[1], [pCRow1]
  .endm
  550. /******************************************************************************/
  // INIT4x2: clear the four accumulators used by the 4x2 kernel
  // (v16, v17, v20, v21), each straight from wzr.
  .macro INIT4x2
        fmov    s16, wzr
        fmov    s17, wzr
        fmov    s20, wzr
        fmov    s21, wzr
  .endm
  // KERNEL4x2_SUB: one K step for a 4x2 tile.
  // Loads two B values into v8 and four A values into v0/v1; v16/v17 take
  // the products with v8.s[0] and v20/v21 those with v8.s[1].
  // pB advances by 8 bytes and pA_0 by 16.
  .macro KERNEL4x2_SUB
        ld1     {v8.2s}, [pB], #8
        ld1     {v0.2s, v1.2s}, [pA_0], #16

        fmla    v16.2s, v0.2s, v8.s[0]
        fmla    v17.2s, v1.2s, v8.s[0]
        fmla    v20.2s, v0.2s, v8.s[1]
        fmla    v21.2s, v1.2s, v8.s[1]
  .endm
  // SAVE4x2: C += alpha * accumulators for a 4x2 tile.
  // Two 16-byte lines, LDC bytes apart, are updated starting at pCRow0.
  // The store of line 0 also advances pCRow0 by 16, so the address of
  // line 1 is taken first.
  // Clobbers v8, v9, v12, v13 and pCRow1.
  .macro SAVE4x2
        add     pCRow1, pCRow0, LDC             // line 1

        ld1     {v8.2s, v9.2s}, [pCRow0]
        fmla    v8.2s, v16.2s, alphaV0
        fmla    v9.2s, v17.2s, alphaV1
        st1     {v8.2s, v9.2s}, [pCRow0], #16

        ld1     {v12.2s, v13.2s}, [pCRow1]
        fmla    v12.2s, v20.2s, alphaV2
        fmla    v13.2s, v21.2s, alphaV3
        st1     {v12.2s, v13.2s}, [pCRow1]
  .endm
  579. /******************************************************************************/
  // INIT2x2: clear the two accumulators used by the 2x2 kernel (v16, v20),
  // each straight from wzr.
  .macro INIT2x2
        fmov    s16, wzr
        fmov    s20, wzr
  .endm
  // KERNEL2x2_SUB: one K step for a 2x2 tile.
  // Loads two B values into v8 and two A values into v0; v16 takes the
  // products with v8.s[0] and v20 those with v8.s[1].
  // pB and pA_0 advance by 8 bytes each.
  .macro KERNEL2x2_SUB
        ld1     {v8.2s}, [pB], #8
        ld1     {v0.2s}, [pA_0], #8

        fmla    v16.2s, v0.2s, v8.s[0]
        fmla    v20.2s, v0.2s, v8.s[1]
  .endm
  // SAVE2x2: C += alpha * accumulators for a 2x2 tile.
  // Two 8-byte lines, LDC bytes apart, are updated starting at pCRow0.
  // The store of line 0 also advances pCRow0 by 8, so the address of
  // line 1 is taken first.
  // Clobbers v8, v12 and pCRow1.
  .macro SAVE2x2
        add     pCRow1, pCRow0, LDC             // line 1

        ld1     {v8.2s}, [pCRow0]
        fmla    v8.2s, v16.2s, alphaV0
        st1     {v8.2s}, [pCRow0], #8

        ld1     {v12.2s}, [pCRow1]
        fmla    v12.2s, v20.2s, alphaV1
        st1     {v12.2s}, [pCRow1]
  .endm
  602. /******************************************************************************/
  // INIT1x2: clear v16, the only accumulator used by the 1x2 kernel.
  .macro INIT1x2
        fmov    s16, wzr
  .endm
  // KERNEL1x2_SUB: one K step for a 1x2 tile.
  // Loads two B values into v8 and one A value into s0, then accumulates
  // v8 * v0.s[0] into v16.
  // pB advances by 8 bytes and pA_0 by 4.
  .macro KERNEL1x2_SUB
        ld1     {v8.2s}, [pB], #8
        ldr     s0, [pA_0], #4

        fmla    v16.2s, v8.2s, v0.s[0]
  .endm
  // SAVE1x2: C += alpha * accumulator for a 1x2 tile.
  // One float on each of two lines, LDC bytes apart, is gathered into the
  // two lanes of v8, updated, and scattered back.
  // The lane 0 store also advances pCRow0 by 4.
  // Clobbers v8 and pCRow1.
  .macro SAVE1x2
        add     pCRow1, pCRow0, LDC             // line 1

        ld1     {v8.s}[0], [pCRow0]
        ld1     {v8.s}[1], [pCRow1]
        fmla    v8.2s, v16.2s, alphaV0
        st1     {v8.s}[0], [pCRow0], #4
        st1     {v8.s}[1], [pCRow1]
  .endm
  622. /******************************************************************************/
  // INIT4x1: clear the two accumulators used by the 4x1 kernel (v16, v17),
  // each straight from wzr.
  .macro INIT4x1
        fmov    s16, wzr
        fmov    s17, wzr
  .endm
  // KERNEL4x1_SUB: one K step for a 4x1 tile.
  // Loads one B value into s8 and four A values into v0/v1, then
  // accumulates v0 * v8.s[0] into v16 and v1 * v8.s[0] into v17.
  // pB advances by 4 bytes and pA_0 by 16.
  .macro KERNEL4x1_SUB
        ldr     s8, [pB], #4
        ld1     {v0.2s, v1.2s}, [pA_0], #16

        fmla    v16.2s, v0.2s, v8.s[0]
        fmla    v17.2s, v1.2s, v8.s[0]
  .endm
  // SAVE4x1: C += alpha * accumulators for a 4x1 tile.
  // Updates the 16 bytes at pCRow0; the store also advances pCRow0 by 16.
  // Clobbers v8 and v9.
  .macro SAVE4x1
        ld1     {v8.2s, v9.2s}, [pCRow0]
        fmla    v8.2s, v16.2s, alphaV0
        fmla    v9.2s, v17.2s, alphaV1
        st1     {v8.2s, v9.2s}, [pCRow0], #16
  .endm
  642. /******************************************************************************/
  // INIT2x1: clear v16, the only accumulator used by the 2x1 kernel.
  .macro INIT2x1
        fmov    s16, wzr
  .endm
  // KERNEL2x1_SUB: one K step for a 2x1 tile.
  // Loads one B value into s8 and two A values into v0, then accumulates
  // v0 * v8.s[0] into v16.
  // pB advances by 4 bytes and pA_0 by 8.
  .macro KERNEL2x1_SUB
        ldr     s8, [pB], #4
        ld1     {v0.2s}, [pA_0], #8

        fmla    v16.2s, v0.2s, v8.s[0]
  .endm
  // SAVE2x1: C += alpha * accumulator for a 2x1 tile.
  // Updates the 8 bytes at pCRow0; the store also advances pCRow0 by 8.
  // Clobbers v8.
  .macro SAVE2x1
        ld1     {v8.2s}, [pCRow0]
        fmla    v8.2s, v16.2s, alphaV0
        st1     {v8.2s}, [pCRow0], #8
  .endm
  659. /******************************************************************************/
  // INIT1x1: clear s16, the scalar accumulator used by the 1x1 kernel.
  .macro INIT1x1
        fmov    s16, wzr
  .endm
  // KERNEL1x1_SUB: one K step for a 1x1 tile, done with scalar arithmetic.
  // Loads one B value into s8 and one A value into s0, then computes
  // s16 = s0 * s8 + s16.  pB and pA_0 advance by 4 bytes each.
  .macro KERNEL1x1_SUB
        ldr     s8, [pB], #4
        ldr     s0, [pA_0], #4

        fmadd   s16, s0, s8, s16
  .endm
  // SAVE1x1: C += alpha * accumulator for a single element.
  // Computes s8 = s16 * alpha0 + [pCRow0] and stores it back; the store
  // also advances pCRow0 by 4.  Clobbers s8.
  .macro SAVE1x1
        ldr     s8, [pCRow0]
        fmadd   s8, s16, alpha0, s8
        str     s8, [pCRow0], #4
  .endm
  676. /*******************************************************************************
  677. * End of macro definitions
  678. *******************************************************************************/
  679. PROLOGUE
  680. .align 5
  681. add sp, sp, #-(11 * 16)
  682. stp d8, d9, [sp, #(0 * 16)]
  683. stp d10, d11, [sp, #(1 * 16)]
  684. stp d12, d13, [sp, #(2 * 16)]
  685. stp d14, d15, [sp, #(3 * 16)]
  686. stp d16, d17, [sp, #(4 * 16)]
  687. stp x18, x19, [sp, #(5 * 16)]
  688. stp x20, x21, [sp, #(6 * 16)]
  689. stp x22, x23, [sp, #(7 * 16)]
  690. stp x24, x25, [sp, #(8 * 16)]
  691. stp x26, x27, [sp, #(9 * 16)]
  692. str x28, [sp, #(10 * 16)]
  693. fmov alpha0, s0
  694. fmov alpha1, s0
  695. fmov alpha2, s0
  696. fmov alpha3, s0
  697. lsl LDC, LDC, #2 // ldc = ldc * 4
  698. mov pB, origPB
  699. mov counterJ, origN
  700. asr counterJ, counterJ, #2 // J = J / 4
  701. cmp counterJ, #0
  702. ble .Lsgemm_kernel_L2_BEGIN
  703. /******************************************************************************/
  704. .Lsgemm_kernel_L4_BEGIN:
  705. mov pCRow0, pC // pCRow0 = C
  706. add pC, pC, LDC, lsl #2
  707. lsl temp, origK, #4 // k * 4 * 4
  708. mov pA_0, origPA // pA_0 = start of A array
  709. add pA_1, temp, pA_0
  710. add pA_2, temp, pA_1
  711. add pA_3, temp, pA_2
  712. .Lsgemm_kernel_L4_M16_BEGIN:
  713. mov counterI, origM
  714. asr counterI, counterI, #4 // counterI = counterI / 16
  715. cmp counterI, #0
  716. ble .Lsgemm_kernel_L4_M8_BEGIN
  717. .Lsgemm_kernel_L4_M16_20:
  718. mov pB, origPB
  719. asr counterL , origK, #1 // L = K / 2
  720. cmp counterL , #2 // is there at least 4 to do?
  721. blt .Lsgemm_kernel_L4_M16_32
  722. KERNEL16x4_I // do one in the K
  723. KERNEL16x4_M2 // do another in the K
  724. subs counterL, counterL, #2
  725. ble .Lsgemm_kernel_L4_M16_22a
  726. .align 5
  727. .Lsgemm_kernel_L4_M16_22:
  728. KERNEL16x4_M1
  729. KERNEL16x4_M2
  730. subs counterL, counterL, #1
  731. bgt .Lsgemm_kernel_L4_M16_22
  732. .Lsgemm_kernel_L4_M16_22a:
  733. KERNEL16x4_M1
  734. KERNEL16x4_E
  735. b .Lsgemm_kernel_L4_M16_44
  736. .Lsgemm_kernel_L4_M16_32:
  737. tst counterL, #1
  738. ble .Lsgemm_kernel_L4_M16_40
  739. KERNEL16x4_I
  740. KERNEL16x4_E
  741. b .Lsgemm_kernel_L4_M16_44
  742. .Lsgemm_kernel_L4_M16_40:
  743. INIT16x4
  744. .Lsgemm_kernel_L4_M16_44:
  745. ands counterL , origK, #1
  746. ble .Lsgemm_kernel_L4_M16_100
  747. .Lsgemm_kernel_L4_M16_46:
  748. KERNEL16x4_SUB
  749. .Lsgemm_kernel_L4_M16_100:
  750. SAVE16x4
  751. .Lsgemm_kernel_L4_M16_END:
  752. lsl temp, origK, #4 // k * 4 * 4 = Four rows of A
  753. add pA_0, pA_0, temp
  754. add pA_0, pA_0, temp
  755. add pA_0, pA_0, temp
  756. add pA_1, pA_0, temp
  757. add pA_2, pA_1, temp
  758. add pA_3, pA_2, temp
  759. subs counterI, counterI, #1
  760. bne .Lsgemm_kernel_L4_M16_20
  761. .Lsgemm_kernel_L4_M8_BEGIN:
  762. mov counterI, origM
  763. tst counterI , #15
  764. ble .Lsgemm_kernel_L4_END
  765. tst counterI, #8
  766. ble .Lsgemm_kernel_L4_M4_BEGIN
  767. .Lsgemm_kernel_L4_M8_20:
  768. INIT8x4
  769. mov pB, origPB
  770. asr counterL, origK, #3 // counterL = counterL / 8
  771. cmp counterL, #0
  772. ble .Lsgemm_kernel_L4_M8_40
  773. .Lsgemm_kernel_L4_M8_22:
  774. KERNEL8x4_SUB
  775. KERNEL8x4_SUB
  776. KERNEL8x4_SUB
  777. KERNEL8x4_SUB
  778. KERNEL8x4_SUB
  779. KERNEL8x4_SUB
  780. KERNEL8x4_SUB
  781. KERNEL8x4_SUB
  782. subs counterL, counterL, #1
  783. bgt .Lsgemm_kernel_L4_M8_22
  784. .Lsgemm_kernel_L4_M8_40:
  785. ands counterL , origK, #7 // counterL = counterL % 8
  786. ble .Lsgemm_kernel_L4_M8_100
  787. .Lsgemm_kernel_L4_M8_42:
  788. KERNEL8x4_SUB
  789. subs counterL, counterL, #1
  790. bgt .Lsgemm_kernel_L4_M8_42
  791. .Lsgemm_kernel_L4_M8_100:
  792. SAVE8x4
  793. .Lsgemm_kernel_L4_M8_END:
  794. lsl temp, origK, #4 // k * 4 * 4
  795. add pA_0, pA_0, temp
  796. .Lsgemm_kernel_L4_M4_BEGIN: // 4-wide N pass, M remainder: optional 4-row tile
  797. mov counterI, origM
  798. tst counterI , #7 // flags only: low 3 bits of origM
  799. ble .Lsgemm_kernel_L4_END // all three bits clear: nothing left for this pass
  800. tst counterI, #4 // flags only: bit 2 of origM
  801. ble .Lsgemm_kernel_L4_M2_BEGIN // bit clear: skip the 4x4 tile
  802. .Lsgemm_kernel_L4_M4_20: // INIT4x4 is defined outside this chunk; presumably resets the tile accumulators - confirm
  803. INIT4x4
  804. mov pB, origPB // pB restarts at origPB for this tile
  805. asr counterL, origK, #3 // counterL = origK / 8 (trip count of the 8x-unrolled loop)
  806. cmp counterL, #0
  807. ble .Lsgemm_kernel_L4_M4_40 // origK / 8 <= 0: go straight to the tail loop
  808. .Lsgemm_kernel_L4_M4_22: // main loop: eight KERNEL4x4_SUB invocations per trip
  809. KERNEL4x4_SUB
  810. KERNEL4x4_SUB
  811. KERNEL4x4_SUB
  812. KERNEL4x4_SUB
  813. KERNEL4x4_SUB
  814. KERNEL4x4_SUB
  815. KERNEL4x4_SUB
  816. KERNEL4x4_SUB
  817. subs counterL, counterL, #1
  818. bgt .Lsgemm_kernel_L4_M4_22 // repeat while counterL > 0
  819. .Lsgemm_kernel_L4_M4_40:
  820. ands counterL , origK, #7 // counterL = origK % 8 (leftover iterations)
  821. ble .Lsgemm_kernel_L4_M4_100 // none left: go to the save step
  822. .Lsgemm_kernel_L4_M4_42: // tail loop: one KERNEL4x4_SUB per trip
  823. KERNEL4x4_SUB
  824. subs counterL, counterL, #1
  825. bgt .Lsgemm_kernel_L4_M4_42
  826. .Lsgemm_kernel_L4_M4_100: // SAVE4x4 is defined outside this chunk; presumably stores the tile to C - confirm
  827. SAVE4x4
  828. .Lsgemm_kernel_L4_M4_END:
  829. .Lsgemm_kernel_L4_M2_BEGIN: // 4-wide N pass, M remainder: optional 2-row tile
  830. mov counterI, origM
  831. tst counterI , #3 // flags only: low 2 bits of origM
  832. ble .Lsgemm_kernel_L4_END // both bits clear: nothing left for this pass
  833. tst counterI, #2 // flags only: bit 1 of origM (counterI is not modified)
  834. ble .Lsgemm_kernel_L4_M1_BEGIN // bit clear: skip the 2x4 tile
  835. .Lsgemm_kernel_L4_M2_20: // INIT2x4 is defined outside this chunk; presumably resets the tile accumulators - confirm
  836. INIT2x4
  837. mov pB, origPB // pB restarts at origPB for this tile
  838. asr counterL , origK, #3 // counterL = origK / 8 (trip count of the 8x-unrolled loop)
  839. cmp counterL , #0
  840. ble .Lsgemm_kernel_L4_M2_40 // origK / 8 <= 0: go straight to the tail loop
  841. .Lsgemm_kernel_L4_M2_22: // main loop: eight KERNEL2x4_SUB invocations per trip
  842. KERNEL2x4_SUB
  843. KERNEL2x4_SUB
  844. KERNEL2x4_SUB
  845. KERNEL2x4_SUB
  846. KERNEL2x4_SUB
  847. KERNEL2x4_SUB
  848. KERNEL2x4_SUB
  849. KERNEL2x4_SUB
  850. subs counterL, counterL, #1
  851. bgt .Lsgemm_kernel_L4_M2_22 // repeat while counterL > 0
  852. .Lsgemm_kernel_L4_M2_40:
  853. ands counterL , origK, #7 // counterL = origK % 8 (leftover iterations)
  854. ble .Lsgemm_kernel_L4_M2_100 // none left: go to the save step
  855. .Lsgemm_kernel_L4_M2_42: // tail loop: one KERNEL2x4_SUB per trip
  856. KERNEL2x4_SUB
  857. subs counterL, counterL, #1
  858. bgt .Lsgemm_kernel_L4_M2_42
  859. .Lsgemm_kernel_L4_M2_100: // SAVE2x4 is defined outside this chunk; presumably stores the tile to C - confirm
  860. SAVE2x4
  861. .Lsgemm_kernel_L4_M2_END:
  862. .Lsgemm_kernel_L4_M1_BEGIN: // 4-wide N pass, M remainder: optional 1-row tile
  863. tst counterI, #1 // flags only: bit 0 of counterI, expected to still hold origM from .Lsgemm_kernel_L4_M2_BEGIN
  864. ble .Lsgemm_kernel_L4_END // bit clear: no single row left
  865. .Lsgemm_kernel_L4_M1_20: // INIT1x4 is defined outside this chunk; presumably resets the tile accumulators - confirm
  866. INIT1x4
  867. mov pB, origPB // pB restarts at origPB for this tile
  868. asr counterL , origK, #3 // counterL = origK / 8 (trip count of the 8x-unrolled loop)
  869. cmp counterL , #0
  870. ble .Lsgemm_kernel_L4_M1_40 // origK / 8 <= 0: go straight to the tail loop
  871. .Lsgemm_kernel_L4_M1_22: // main loop: eight KERNEL1x4_SUB invocations per trip
  872. KERNEL1x4_SUB
  873. KERNEL1x4_SUB
  874. KERNEL1x4_SUB
  875. KERNEL1x4_SUB
  876. KERNEL1x4_SUB
  877. KERNEL1x4_SUB
  878. KERNEL1x4_SUB
  879. KERNEL1x4_SUB
  880. subs counterL, counterL, #1
  881. bgt .Lsgemm_kernel_L4_M1_22 // repeat while counterL > 0
  882. .Lsgemm_kernel_L4_M1_40:
  883. ands counterL , origK, #7 // counterL = origK % 8 (leftover iterations)
  884. ble .Lsgemm_kernel_L4_M1_100 // none left: go to the save step
  885. .Lsgemm_kernel_L4_M1_42: // tail loop: one KERNEL1x4_SUB per trip
  886. KERNEL1x4_SUB
  887. subs counterL, counterL, #1
  888. bgt .Lsgemm_kernel_L4_M1_42
  889. .Lsgemm_kernel_L4_M1_100: // SAVE1x4 is defined outside this chunk; presumably stores the tile to C - confirm
  890. SAVE1x4
  891. .Lsgemm_kernel_L4_END: // end of one 4-wide N pass: advance B and loop on counterJ
  892. lsl temp, origK, #4 // temp = origK * 16
  893. add origPB, origPB, temp // B = B + K * 4 * 4
  894. subs counterJ, counterJ , #1 // j--
  895. bgt .Lsgemm_kernel_L4_BEGIN // another pass while counterJ > 0 (target label is above this chunk)
  896. /******************************************************************************/
  897. .Lsgemm_kernel_L2_BEGIN: // N remainder (origN & 3): optional 2-wide pass
  898. mov counterJ , origN
  899. tst counterJ , #3 // flags only: low 2 bits of origN
  900. ble .Lsgemm_kernel_L999 // both bits clear: no N remainder, go to exit
  901. tst counterJ , #2 // flags only: bit 1 of origN
  902. ble .Lsgemm_kernel_L1_BEGIN // bit clear: skip the 2-wide pass
  903. mov pCRow0, pC // pCRow0 = pC
  904. add pC,pC,LDC, lsl #1 // pC = pC + LDC * 2
  905. mov pA_0, origPA // pA_0 = A
  906. .Lsgemm_kernel_L2_M4_BEGIN: // 2-wide N pass: loop over 4-row tiles
  907. mov counterI, origM
  908. asr counterI, counterI, #2 // counterI = origM / 4
  909. cmp counterI,#0
  910. ble .Lsgemm_kernel_L2_M2_BEGIN // no full 4-row tile: go to the M remainder
  911. .Lsgemm_kernel_L2_M4_20: // per-tile entry; INIT4x2 is defined outside this chunk, presumably resets the accumulators - confirm
  912. INIT4x2
  913. mov pB, origPB // pB restarts at origPB for every tile
  914. asr counterL , origK, #3 // counterL = origK / 8 (trip count of the 8x-unrolled loop)
  915. cmp counterL,#0
  916. ble .Lsgemm_kernel_L2_M4_40 // origK / 8 <= 0: go straight to the tail loop
  917. .align 5
  918. .Lsgemm_kernel_L2_M4_22: // main loop (32-byte aligned by the .align 5 above): eight KERNEL4x2_SUB per trip
  919. KERNEL4x2_SUB
  920. KERNEL4x2_SUB
  921. KERNEL4x2_SUB
  922. KERNEL4x2_SUB
  923. KERNEL4x2_SUB
  924. KERNEL4x2_SUB
  925. KERNEL4x2_SUB
  926. KERNEL4x2_SUB
  927. subs counterL, counterL, #1
  928. bgt .Lsgemm_kernel_L2_M4_22 // repeat while counterL > 0
  929. .Lsgemm_kernel_L2_M4_40:
  930. ands counterL , origK, #7 // counterL = origK % 8 (leftover iterations)
  931. ble .Lsgemm_kernel_L2_M4_100 // none left: go to the save step
  932. .Lsgemm_kernel_L2_M4_42: // tail loop: one KERNEL4x2_SUB per trip
  933. KERNEL4x2_SUB
  934. subs counterL, counterL, #1
  935. bgt .Lsgemm_kernel_L2_M4_42
  936. .Lsgemm_kernel_L2_M4_100: // SAVE4x2 is defined outside this chunk; presumably stores the tile to C - confirm
  937. SAVE4x2
  938. .Lsgemm_kernel_L2_M4_END:
  939. subs counterI, counterI, #1 // one 4-row tile done
  940. bgt .Lsgemm_kernel_L2_M4_20 // next tile while counterI > 0
  941. .Lsgemm_kernel_L2_M2_BEGIN: // 2-wide N pass, M remainder: optional 2-row tile
  942. mov counterI, origM
  943. tst counterI , #3 // flags only: low 2 bits of origM
  944. ble .Lsgemm_kernel_L2_END // both bits clear: no remainder rows
  945. tst counterI, #2 // flags only: bit 1 of origM (counterI is not modified)
  946. ble .Lsgemm_kernel_L2_M1_BEGIN // bit clear: skip the 2x2 tile
  947. .Lsgemm_kernel_L2_M2_20: // INIT2x2 is defined outside this chunk; presumably resets the tile accumulators - confirm
  948. INIT2x2
  949. mov pB, origPB // pB restarts at origPB for this tile
  950. asr counterL , origK, #3 // counterL = origK / 8 (trip count of the 8x-unrolled loop)
  951. cmp counterL,#0
  952. ble .Lsgemm_kernel_L2_M2_40 // origK / 8 <= 0: go straight to the tail loop
  953. .Lsgemm_kernel_L2_M2_22: // main loop: eight KERNEL2x2_SUB invocations per trip
  954. KERNEL2x2_SUB
  955. KERNEL2x2_SUB
  956. KERNEL2x2_SUB
  957. KERNEL2x2_SUB
  958. KERNEL2x2_SUB
  959. KERNEL2x2_SUB
  960. KERNEL2x2_SUB
  961. KERNEL2x2_SUB
  962. subs counterL, counterL, #1
  963. bgt .Lsgemm_kernel_L2_M2_22 // repeat while counterL > 0
  964. .Lsgemm_kernel_L2_M2_40:
  965. ands counterL , origK, #7 // counterL = origK % 8 (leftover iterations)
  966. ble .Lsgemm_kernel_L2_M2_100 // none left: go to the save step
  967. .Lsgemm_kernel_L2_M2_42: // tail loop: one KERNEL2x2_SUB per trip
  968. KERNEL2x2_SUB
  969. subs counterL, counterL, #1
  970. bgt .Lsgemm_kernel_L2_M2_42
  971. .Lsgemm_kernel_L2_M2_100: // SAVE2x2 is defined outside this chunk; presumably stores the tile to C - confirm
  972. SAVE2x2
  973. .Lsgemm_kernel_L2_M2_END:
  974. .Lsgemm_kernel_L2_M1_BEGIN: // 2-wide N pass, M remainder: optional 1-row tile
  975. tst counterI, #1 // flags only: bit 0 of counterI, expected to still hold origM from .Lsgemm_kernel_L2_M2_BEGIN
  976. ble .Lsgemm_kernel_L2_END // bit clear: no single row left
  977. .Lsgemm_kernel_L2_M1_20: // INIT1x2 is defined outside this chunk; presumably resets the tile accumulators - confirm
  978. INIT1x2
  979. mov pB, origPB // pB restarts at origPB for this tile
  980. asr counterL , origK, #3 // counterL = origK / 8 (trip count of the 8x-unrolled loop)
  981. cmp counterL, #0
  982. ble .Lsgemm_kernel_L2_M1_40 // origK / 8 <= 0: go straight to the tail loop
  983. .Lsgemm_kernel_L2_M1_22: // main loop: eight KERNEL1x2_SUB invocations per trip
  984. KERNEL1x2_SUB
  985. KERNEL1x2_SUB
  986. KERNEL1x2_SUB
  987. KERNEL1x2_SUB
  988. KERNEL1x2_SUB
  989. KERNEL1x2_SUB
  990. KERNEL1x2_SUB
  991. KERNEL1x2_SUB
  992. subs counterL, counterL, #1
  993. bgt .Lsgemm_kernel_L2_M1_22 // repeat while counterL > 0
  994. .Lsgemm_kernel_L2_M1_40:
  995. ands counterL , origK, #7 // counterL = origK % 8 (leftover iterations)
  996. ble .Lsgemm_kernel_L2_M1_100 // none left: go to the save step
  997. .Lsgemm_kernel_L2_M1_42: // tail loop: one KERNEL1x2_SUB per trip
  998. KERNEL1x2_SUB
  999. subs counterL, counterL, #1
  1000. bgt .Lsgemm_kernel_L2_M1_42
  1001. .Lsgemm_kernel_L2_M1_100: // SAVE1x2 is defined outside this chunk; presumably stores the tile to C - confirm
  1002. SAVE1x2
  1003. .Lsgemm_kernel_L2_END: // end of the 2-wide N pass: step origPB past it, then fall through to the 1-wide pass
  1004. add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4
  1005. /******************************************************************************/
  1006. .Lsgemm_kernel_L1_BEGIN: // N remainder: optional 1-wide pass
  1007. mov counterJ , origN
  1008. tst counterJ , #1 // flags only: bit 0 of origN
  1009. ble .Lsgemm_kernel_L999 // bit clear: done
  1010. mov pCRow0, pC // pCRow0 = C
  1011. add pC , pC , LDC // pC = pC + LDC
  1012. mov pA_0, origPA // pA_0 = A
  1013. .Lsgemm_kernel_L1_M4_BEGIN: // 1-wide N pass: loop over 4-row tiles
  1014. mov counterI, origM
  1015. asr counterI, counterI, #2 // counterI = origM / 4
  1016. cmp counterI, #0
  1017. ble .Lsgemm_kernel_L1_M2_BEGIN // no full 4-row tile: go to the M remainder
  1018. .Lsgemm_kernel_L1_M4_20: // per-tile entry; INIT4x1 is defined outside this chunk, presumably resets the accumulators - confirm
  1019. INIT4x1
  1020. mov pB, origPB // pB restarts at origPB for every tile
  1021. asr counterL , origK, #3 // counterL = origK / 8 (trip count of the 8x-unrolled loop)
  1022. cmp counterL , #0
  1023. ble .Lsgemm_kernel_L1_M4_40 // origK / 8 <= 0: go straight to the tail loop
  1024. .align 5
  1025. .Lsgemm_kernel_L1_M4_22: // main loop (32-byte aligned by the .align 5 above): eight KERNEL4x1_SUB per trip
  1026. KERNEL4x1_SUB
  1027. KERNEL4x1_SUB
  1028. KERNEL4x1_SUB
  1029. KERNEL4x1_SUB
  1030. KERNEL4x1_SUB
  1031. KERNEL4x1_SUB
  1032. KERNEL4x1_SUB
  1033. KERNEL4x1_SUB
  1034. subs counterL, counterL, #1
  1035. bgt .Lsgemm_kernel_L1_M4_22 // repeat while counterL > 0
  1036. .Lsgemm_kernel_L1_M4_40:
  1037. ands counterL , origK, #7 // counterL = origK % 8 (leftover iterations)
  1038. ble .Lsgemm_kernel_L1_M4_100 // none left: go to the save step
  1039. .Lsgemm_kernel_L1_M4_42: // tail loop: one KERNEL4x1_SUB per trip
  1040. KERNEL4x1_SUB
  1041. subs counterL, counterL, #1
  1042. bgt .Lsgemm_kernel_L1_M4_42
  1043. .Lsgemm_kernel_L1_M4_100: // SAVE4x1 is defined outside this chunk; presumably stores the tile to C - confirm
  1044. SAVE4x1
  1045. .Lsgemm_kernel_L1_M4_END:
  1046. subs counterI, counterI, #1 // one 4-row tile done
  1047. bgt .Lsgemm_kernel_L1_M4_20 // next tile while counterI > 0
  1048. .Lsgemm_kernel_L1_M2_BEGIN: // 1-wide N pass, M remainder: optional 2-row tile
  1049. mov counterI, origM
  1050. tst counterI , #3 // flags only: low 2 bits of origM
  1051. ble .Lsgemm_kernel_L1_END // both bits clear: no remainder rows
  1052. tst counterI, #2 // flags only: bit 1 of origM (counterI is not modified)
  1053. ble .Lsgemm_kernel_L1_M1_BEGIN // bit clear: skip the 2x1 tile
  1054. .Lsgemm_kernel_L1_M2_20: // INIT2x1 is defined outside this chunk; presumably resets the tile accumulators - confirm
  1055. INIT2x1
  1056. mov pB, origPB // pB restarts at origPB for this tile
  1057. asr counterL , origK, #3 // counterL = origK / 8 (trip count of the 8x-unrolled loop)
  1058. cmp counterL , #0
  1059. ble .Lsgemm_kernel_L1_M2_40 // origK / 8 <= 0: go straight to the tail loop
  1060. .Lsgemm_kernel_L1_M2_22: // main loop: eight KERNEL2x1_SUB invocations per trip
  1061. KERNEL2x1_SUB
  1062. KERNEL2x1_SUB
  1063. KERNEL2x1_SUB
  1064. KERNEL2x1_SUB
  1065. KERNEL2x1_SUB
  1066. KERNEL2x1_SUB
  1067. KERNEL2x1_SUB
  1068. KERNEL2x1_SUB
  1069. subs counterL, counterL, #1
  1070. bgt .Lsgemm_kernel_L1_M2_22 // repeat while counterL > 0
  1071. .Lsgemm_kernel_L1_M2_40:
  1072. ands counterL , origK, #7 // counterL = origK % 8 (leftover iterations)
  1073. ble .Lsgemm_kernel_L1_M2_100 // none left: go to the save step
  1074. .Lsgemm_kernel_L1_M2_42: // tail loop: one KERNEL2x1_SUB per trip
  1075. KERNEL2x1_SUB
  1076. subs counterL, counterL, #1
  1077. bgt .Lsgemm_kernel_L1_M2_42
  1078. .Lsgemm_kernel_L1_M2_100: // SAVE2x1 is defined outside this chunk; presumably stores the tile to C - confirm
  1079. SAVE2x1
  1080. .Lsgemm_kernel_L1_M2_END:
  1081. .Lsgemm_kernel_L1_M1_BEGIN: // 1-wide N pass, M remainder: optional 1-row tile
  1082. tst counterI, #1 // flags only: bit 0 of counterI, expected to still hold origM from .Lsgemm_kernel_L1_M2_BEGIN
  1083. ble .Lsgemm_kernel_L1_END // bit clear: no single row left
  1084. .Lsgemm_kernel_L1_M1_20: // INIT1x1 is defined outside this chunk; presumably resets the tile accumulators - confirm
  1085. INIT1x1
  1086. mov pB, origPB // pB restarts at origPB for this tile
  1087. asr counterL , origK, #3 // counterL = origK / 8 (trip count of the 8x-unrolled loop)
  1088. cmp counterL , #0
  1089. ble .Lsgemm_kernel_L1_M1_40 // origK / 8 <= 0: go straight to the tail loop
  1090. .Lsgemm_kernel_L1_M1_22: // main loop: eight KERNEL1x1_SUB invocations per trip
  1091. KERNEL1x1_SUB
  1092. KERNEL1x1_SUB
  1093. KERNEL1x1_SUB
  1094. KERNEL1x1_SUB
  1095. KERNEL1x1_SUB
  1096. KERNEL1x1_SUB
  1097. KERNEL1x1_SUB
  1098. KERNEL1x1_SUB
  1099. subs counterL, counterL, #1
  1100. bgt .Lsgemm_kernel_L1_M1_22 // repeat while counterL > 0
  1101. .Lsgemm_kernel_L1_M1_40:
  1102. ands counterL , origK, #7 // counterL = origK % 8 (leftover iterations)
  1103. ble .Lsgemm_kernel_L1_M1_100 // none left: go to the save step
  1104. .Lsgemm_kernel_L1_M1_42: // tail loop: one KERNEL1x1_SUB per trip
  1105. KERNEL1x1_SUB
  1106. subs counterL, counterL, #1
  1107. bgt .Lsgemm_kernel_L1_M1_42
  1108. .Lsgemm_kernel_L1_M1_100: // SAVE1x1 is defined outside this chunk; presumably stores the tile to C - confirm
  1109. SAVE1x1
  1110. .Lsgemm_kernel_L1_END:
  1111. .Lsgemm_kernel_L999: // common exit: set the return value, reload saved registers, pop the frame
  1112. mov x0, #0 // set return value
  1113. ldp d8, d9, [sp, #(0 * 16)] // reload d8-d17 from slots 0-4 of the frame at sp
  1114. ldp d10, d11, [sp, #(1 * 16)]
  1115. ldp d12, d13, [sp, #(2 * 16)]
  1116. ldp d14, d15, [sp, #(3 * 16)]
  1117. ldp d16, d17, [sp, #(4 * 16)]
  1118. ldp x18, x19, [sp, #(5 * 16)] // reload x18-x28 from slots 5-10; NOTE(review): x18 is platform-reserved under some AArch64 ABIs - confirm for all targets
  1119. ldp x20, x21, [sp, #(6 * 16)]
  1120. ldp x22, x23, [sp, #(7 * 16)]
  1121. ldp x24, x25, [sp, #(8 * 16)]
  1122. ldp x26, x27, [sp, #(9 * 16)]
  1123. ldr x28, [sp, #(10 * 16)]
  1124. add sp, sp, #(11*16) // release the 11 * 16 byte frame; presumably allocated by the prologue above this chunk - confirm
  1125. ret
  1126. EPILOGUE