You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

cgemm_kernel_4x4.S 35 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683
  1. /*******************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. /* X0 X1 X2 s0 X3 x4 x5 x6 */
  30. /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) */
  /* Integer argument registers, named after the CNAME prototype comment above. */
  31. #define origM x0
  32. #define origN x1
  33. #define origK x2
  34. #define origPA x3
  35. #define origPB x4
  36. #define pC x5
  37. #define LDC x6
  /* Scratch register and loop counters. */
  38. #define temp x7
  39. #define counterL x8
  40. #define counterI x9
  41. #define counterJ x10
  /* Working pointers. pB and pA are advanced by the KERNEL macros; pCRow0..2
     address C in the SAVE macros. ppA is a second A pointer: the 8x4 macros
     read their second group of four complex A values through it. */
  42. #define pB x11
  43. #define pCRow0 x12
  44. #define pCRow1 x13
  45. #define pCRow2 x14
  46. #define pA x15
  47. #define ppC x16
  48. #define ppA x17
  /* alpha is held in two register pairs (alpha0_*, alpha1_*). The sN name is
     the scalar view; the vN.s[0] name is the lane form the SAVE macros pass
     as the by-element operand of fmla/fmls. */
  49. #define alpha0_R s10
  50. #define alphaV0_R v10.s[0]
  51. #define alpha0_I s11
  52. #define alphaV0_I v11.s[0]
  53. #define alpha1_R s14
  54. #define alphaV1_R v14.s[0]
  55. #define alpha1_I s15
  56. #define alphaV1_I v15.s[0]
  /* Select the accumulate instruction for each of the four partial products of
     a complex multiply. In the KERNEL macros:
       OP_rr : real acc  <- A_re * B_re
       OP_ii : real acc  <- A_im * B_im
       OP_ri : imag acc  <- A_re * B_im
       OP_ir : imag acc  <- A_im * B_re
     fmla adds the product, fmls subtracts it, so the four groups below give
       1st group: (ar*br - ai*bi) + i( ar*bi + ai*br)   = a * b
       2nd group: (ar*br + ai*bi) + i(-ar*bi + ai*br)   = a * conj(b)
       3rd group: (ar*br + ai*bi) + i( ar*bi - ai*br)   = conj(a) * b
       4th group: (ar*br - ai*bi) + i(-ar*bi - ai*br)   = conj(a) * conj(b)
     If none of the sixteen variant symbols is defined, the OP_* names stay
     undefined. */
  57. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  58. #define OP_rr fmla
  59. #define OP_ii fmls
  60. #define OP_ri fmla
  61. #define OP_ir fmla
  62. #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  63. #define OP_rr fmla
  64. #define OP_ii fmla
  65. #define OP_ri fmls
  66. #define OP_ir fmla
  67. #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  68. #define OP_rr fmla
  69. #define OP_ii fmla
  70. #define OP_ri fmla
  71. #define OP_ir fmls
  72. #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
  73. #define OP_rr fmla
  74. #define OP_ii fmls
  75. #define OP_ri fmls
  76. #define OP_ir fmls
  77. #endif
  78. // 00 origM
  79. // 01 origN
  80. // 02 origK
  81. // 03 origPA
  82. // 04 origPB
  83. // 05 pC
  84. // 06 origLDC -> LDC
  85. // 07 offset -> temp
  86. // 08 counterL
  87. // 09 counterI
  88. // 10 counterJ
  89. // 11 pB
  90. // 12 pCRow0
  91. // 13 pCRow1
  92. // 14 pCRow2
  93. // 15 pA
  94. // 16 ppC
  95. // 17 ppA
  96. // 18 must save
  97. // 19 must save
  98. // 20 must save
  99. // 21 must save
  100. // 22 must save
  101. // 23 must save
  102. // 24 must save
  103. // 25 must save
  104. // 26 must save
  105. // 27 must save
  106. // 28 must save
  107. // 29 frame
  108. // 30 link
  109. // 31 sp
  110. //v00 ALPHA_R -> pA00_R, pA01_R, pA02_R, pA03_R
  111. //v01 ALPHA_I -> pA00_I, pA01_I, pA02_I, pA03_I
  112. //v02 ppA00_R, ppA01_R, ppA02_R, ppA03_R
  113. //v03 ppA00_I, ppA01_I, ppA02_I, ppA03_I
  114. //v04 pA10_R, pA11_R, pA12_R, pA13_R
  115. //v05 pA10_I, pA11_I, pA12_I, pA13_I
  116. //v06 ppA10_R, ppA11_R, ppA12_R, ppA13_R
  117. //v07 ppA10_I, ppA11_I, ppA12_I, ppA13_I
  118. //v08 must save pB00_R, pB01_R, pB02_R, pB03_R
  119. //v09 must save pB00_I, pB01_I, pB02_I, pB03_I
  120. //v10 must save ALPHA0_R
  121. //v11 must save ALPHA0_I
  122. //v12 must save pB10_R, pB11_R, pB12_R, pB13_R
  123. //v13 must save pB10_I, pB11_I, pB12_I, pB13_I
  124. //v14 must save ALPHA1_R
  125. //v15 must save ALPHA1_I
  126. //v16 must save pC00_R, pC01_R, pC02_R, pC03_R
  127. //v17 must save pC00_I, pC01_I, pC02_I, pC03_I
  128. //v18 ppC00_R, ppC01_R, ppC02_R, ppC03_R
  129. //v19 ppC00_I, ppC01_I, ppC02_I, ppC03_I
  130. //v20 pC10_R, pC11_R, pC12_R, pC13_R
  131. //v21 pC10_I, pC11_I, pC12_I, pC13_I
  132. //v22 ppC10_R, ppC11_R, ppC12_R, ppC13_R
  133. //v23 ppC10_I, ppC11_I, ppC12_I, ppC13_I
  134. //v24 pC20_R, pC21_R, pC22_R, pC23_R
  135. //v25 pC20_I, pC21_I, pC22_I, pC23_I
  136. //v26 ppC20_R, ppC21_R, ppC22_R, ppC23_R
  137. //v27 ppC20_I, ppC21_I, ppC22_I, ppC23_I
  138. //v28 pC30_R, pC31_R, pC32_R, pC33_R
  139. //v29 pC30_I, pC31_I, pC32_I, pC33_I
  140. //v30 ppC30_R, ppC31_R, ppC32_R, ppC33_R
  141. //v31 ppC30_I, ppC31_I, ppC32_I, ppC33_I
  142. /*******************************************************************************
  143. * Macro definitions
  144. *******************************************************************************/
  /*
   * INIT8x4: clear the sixteen accumulators v16..v31 used by the 8x4 tile.
   * Each scalar fmov write leaves the rest of the 128-bit register zero,
   * so every lane of every accumulator starts at 0.0.
   */
  .macro INIT8x4
          fmov    s16, wzr
          fmov    s17, wzr
          fmov    s18, wzr
          fmov    s19, wzr
          fmov    s20, wzr
          fmov    s21, wzr
          fmov    s22, wzr
          fmov    s23, wzr
          fmov    s24, wzr
          fmov    s25, wzr
          fmov    s26, wzr
          fmov    s27, wzr
          fmov    s28, wzr
          fmov    s29, wzr
          fmov    s30, wzr
          fmov    s31, wzr
  .endm
  /*
   * KERNEL8x4_I: first step of the software-pipelined 8x4 loop.
   * Loads one k-step of B (v8/v9) and A (v0/v1 via pA, v2/v3 via ppA), writes
   * the products into v16..v31 with fmul (so the accumulators need no prior
   * clearing), then preloads the following k-step into v12/v13 and v4..v7,
   * which is what KERNEL8x4_M2 / KERNEL8x4_E consume.
   * ld2 de-interleaves: the first register gets the even (real) elements and
   * the second the odd (imaginary) ones.
   */
  163. .macro KERNEL8x4_I
  164. ld2 {v8.4s, v9.4s}, [pB]
  165. add pB, pB, #32
  166. ld2 {v0.4s, v1.4s}, [pA]
  167. add pA, pA, #32
  168. ld2 {v2.4s, v3.4s}, [ppA]
  169. add ppA, ppA, #32
  // A values from pA times B element 0: real part in v16, imaginary in v17.
  170. fmul v16.4s, v0.4s, v8.s[0]
  171. OP_ii v16.4s, v1.4s, v9.s[0]
  // In the variants listed below OP_ri is fmls, so the first imaginary term
  // must be subtracted: zero the accumulator and fmls into it instead of fmul.
  172. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  173. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  174. eor v17.16b, v17.16b, v17.16b
  175. fmls v17.4s, v0.4s, v9.s[0]
  176. #else
  177. fmul v17.4s, v0.4s, v9.s[0]
  178. #endif
  179. OP_ir v17.4s, v1.4s, v8.s[0]
  // B element 1 -> v20/v21.
  180. fmul v20.4s, v0.4s, v8.s[1]
  181. OP_ii v20.4s, v1.4s, v9.s[1]
  182. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  183. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  184. eor v21.16b, v21.16b, v21.16b
  185. fmls v21.4s, v0.4s, v9.s[1]
  186. #else
  187. fmul v21.4s, v0.4s, v9.s[1]
  188. #endif
  189. OP_ir v21.4s, v1.4s, v8.s[1]
  // B element 2 -> v24/v25.
  190. fmul v24.4s, v0.4s, v8.s[2]
  191. OP_ii v24.4s, v1.4s, v9.s[2]
  192. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  193. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  194. eor v25.16b, v25.16b, v25.16b
  195. fmls v25.4s, v0.4s, v9.s[2]
  196. #else
  197. fmul v25.4s, v0.4s, v9.s[2]
  198. #endif
  199. OP_ir v25.4s, v1.4s, v8.s[2]
  // B element 3 -> v28/v29.
  200. fmul v28.4s, v0.4s, v8.s[3]
  201. OP_ii v28.4s, v1.4s, v9.s[3]
  202. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  203. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  204. eor v29.16b, v29.16b, v29.16b
  205. fmls v29.4s, v0.4s, v9.s[3]
  206. #else
  207. fmul v29.4s, v0.4s, v9.s[3]
  208. #endif
  209. OP_ir v29.4s, v1.4s, v8.s[3]
  // Same four B elements against the A values from ppA (v2/v3):
  // results go to v18/v19, v22/v23, v26/v27, v30/v31.
  210. fmul v18.4s, v2.4s, v8.s[0]
  211. OP_ii v18.4s, v3.4s, v9.s[0]
  212. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  213. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  214. eor v19.16b, v19.16b, v19.16b
  215. fmls v19.4s, v2.4s, v9.s[0]
  216. #else
  217. fmul v19.4s, v2.4s, v9.s[0]
  218. #endif
  219. OP_ir v19.4s, v3.4s, v8.s[0]
  220. fmul v22.4s, v2.4s, v8.s[1]
  221. OP_ii v22.4s, v3.4s, v9.s[1]
  222. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  223. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  224. eor v23.16b, v23.16b, v23.16b
  225. fmls v23.4s, v2.4s, v9.s[1]
  226. #else
  227. fmul v23.4s, v2.4s, v9.s[1]
  228. #endif
  229. OP_ir v23.4s, v3.4s, v8.s[1]
  230. fmul v26.4s, v2.4s, v8.s[2]
  231. OP_ii v26.4s, v3.4s, v9.s[2]
  232. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  233. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  234. eor v27.16b, v27.16b, v27.16b
  235. fmls v27.4s, v2.4s, v9.s[2]
  236. #else
  237. fmul v27.4s, v2.4s, v9.s[2]
  238. #endif
  239. OP_ir v27.4s, v3.4s, v8.s[2]
  240. fmul v30.4s, v2.4s, v8.s[3]
  241. OP_ii v30.4s, v3.4s, v9.s[3]
  242. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  243. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  244. eor v31.16b, v31.16b, v31.16b
  245. fmls v31.4s, v2.4s, v9.s[3]
  246. #else
  247. fmul v31.4s, v2.4s, v9.s[3]
  248. #endif
  249. OP_ir v31.4s, v3.4s, v8.s[3]
  // Preload the next k-step into the alternate register set.
  250. ld2 {v12.4s, v13.4s}, [pB]
  251. add pB, pB, #32
  252. ld2 {v4.4s, v5.4s} , [pA]
  253. add pA, pA, #32
  254. ld2 {v6.4s, v7.4s} , [ppA]
  255. add ppA, ppA, #32
  256. .endm
  /*
   * KERNEL8x4_M1: pipelined 8x4 step that accumulates from the "even"
   * register set (A in v0..v3, B in v8/v9) into v16..v31 while loading the
   * next k-step into the "odd" set (A in v4..v7, B in v12/v13).
   * Loads and prefetches are interleaved with the arithmetic; each pointer
   * advances by 32 bytes and is prefetched 512 bytes ahead.
   */
  257. .macro KERNEL8x4_M1
  // A (pA) x B element 0 -> v16/v17
  258. OP_rr v16.4s, v0.4s, v8.s[0]
  259. OP_ii v16.4s, v1.4s, v9.s[0]
  260. OP_ri v17.4s, v0.4s, v9.s[0]
  261. OP_ir v17.4s, v1.4s, v8.s[0]
  262. ld2 {v12.4s, v13.4s}, [pB] // for next round
  263. add pB, pB, #32
  // A (pA) x B element 1 -> v20/v21
  264. OP_rr v20.4s, v0.4s, v8.s[1]
  265. OP_ii v20.4s, v1.4s, v9.s[1]
  266. OP_ri v21.4s, v0.4s, v9.s[1]
  267. OP_ir v21.4s, v1.4s, v8.s[1]
  268. prfm PLDL1KEEP, [pB, #512]
  // A (pA) x B element 2 -> v24/v25
  269. OP_rr v24.4s, v0.4s, v8.s[2]
  270. OP_ii v24.4s, v1.4s, v9.s[2]
  271. OP_ri v25.4s, v0.4s, v9.s[2]
  272. OP_ir v25.4s, v1.4s, v8.s[2]
  273. ld2 {v4.4s, v5.4s} , [pA] // for next round
  274. add pA, pA, #32
  // A (pA) x B element 3 -> v28/v29
  275. OP_rr v28.4s, v0.4s, v8.s[3]
  276. OP_ii v28.4s, v1.4s, v9.s[3]
  277. OP_ri v29.4s, v0.4s, v9.s[3]
  278. OP_ir v29.4s, v1.4s, v8.s[3]
  279. prfm PLDL1KEEP, [pA, #512]
  // A (ppA) x B element 0 -> v18/v19
  280. OP_rr v18.4s, v2.4s, v8.s[0]
  281. OP_ii v18.4s, v3.4s, v9.s[0]
  282. OP_ri v19.4s, v2.4s, v9.s[0]
  283. OP_ir v19.4s, v3.4s, v8.s[0]
  284. ld2 {v6.4s, v7.4s} , [ppA] // for next round
  285. add ppA, ppA, #32
  // A (ppA) x B element 1 -> v22/v23
  286. OP_rr v22.4s, v2.4s, v8.s[1]
  287. OP_ii v22.4s, v3.4s, v9.s[1]
  288. OP_ri v23.4s, v2.4s, v9.s[1]
  289. OP_ir v23.4s, v3.4s, v8.s[1]
  290. prfm PLDL1KEEP, [ppA, #512]
  // A (ppA) x B element 2 -> v26/v27
  291. OP_rr v26.4s, v2.4s, v8.s[2]
  292. OP_ii v26.4s, v3.4s, v9.s[2]
  293. OP_ri v27.4s, v2.4s, v9.s[2]
  294. OP_ir v27.4s, v3.4s, v8.s[2]
  // A (ppA) x B element 3 -> v30/v31
  295. OP_rr v30.4s, v2.4s, v8.s[3]
  296. OP_ii v30.4s, v3.4s, v9.s[3]
  297. OP_ri v31.4s, v2.4s, v9.s[3]
  298. OP_ir v31.4s, v3.4s, v8.s[3]
  299. .endm
  /*
   * KERNEL8x4_M2: counterpart of KERNEL8x4_M1. Accumulates from the "odd"
   * register set (A in v4..v7, B in v12/v13) into v16..v31 while loading the
   * next k-step into the "even" set (A in v0..v3, B in v8/v9).
   * Each pointer advances by 32 bytes and is prefetched 512 bytes ahead.
   */
  300. .macro KERNEL8x4_M2
  // A (pA) x B element 0 -> v16/v17
  301. OP_rr v16.4s, v4.4s, v12.s[0]
  302. OP_ii v16.4s, v5.4s, v13.s[0]
  303. OP_ri v17.4s, v4.4s, v13.s[0]
  304. OP_ir v17.4s, v5.4s, v12.s[0]
  305. ld2 {v8.4s, v9.4s}, [pB] // for next round
  306. add pB, pB, #32
  // A (pA) x B element 1 -> v20/v21
  307. OP_rr v20.4s, v4.4s, v12.s[1]
  308. OP_ii v20.4s, v5.4s, v13.s[1]
  309. OP_ri v21.4s, v4.4s, v13.s[1]
  310. OP_ir v21.4s, v5.4s, v12.s[1]
  311. prfm PLDL1KEEP, [pA, #512]
  // A (pA) x B element 2 -> v24/v25
  312. OP_rr v24.4s, v4.4s, v12.s[2]
  313. OP_ii v24.4s, v5.4s, v13.s[2]
  314. OP_ri v25.4s, v4.4s, v13.s[2]
  315. OP_ir v25.4s, v5.4s, v12.s[2]
  316. ld2 {v0.4s, v1.4s}, [pA] // for next round
  317. add pA, pA, #32
  // A (pA) x B element 3 -> v28/v29
  318. OP_rr v28.4s, v4.4s, v12.s[3]
  319. OP_ii v28.4s, v5.4s, v13.s[3]
  320. OP_ri v29.4s, v4.4s, v13.s[3]
  321. OP_ir v29.4s, v5.4s, v12.s[3]
  322. prfm PLDL1KEEP, [ppA, #512]
  // A (ppA) x B element 0 -> v18/v19
  323. OP_rr v18.4s, v6.4s, v12.s[0]
  324. OP_ii v18.4s, v7.4s, v13.s[0]
  325. OP_ri v19.4s, v6.4s, v13.s[0]
  326. OP_ir v19.4s, v7.4s, v12.s[0]
  327. ld2 {v2.4s, v3.4s}, [ppA] // for next round
  328. add ppA, ppA, #32
  // A (ppA) x B element 1 -> v22/v23
  329. OP_rr v22.4s, v6.4s, v12.s[1]
  330. OP_ii v22.4s, v7.4s, v13.s[1]
  331. OP_ri v23.4s, v6.4s, v13.s[1]
  332. OP_ir v23.4s, v7.4s, v12.s[1]
  333. prfm PLDL1KEEP, [pB, #512]
  // A (ppA) x B element 2 -> v26/v27
  334. OP_rr v26.4s, v6.4s, v12.s[2]
  335. OP_ii v26.4s, v7.4s, v13.s[2]
  336. OP_ri v27.4s, v6.4s, v13.s[2]
  337. OP_ir v27.4s, v7.4s, v12.s[2]
  // A (ppA) x B element 3 -> v30/v31
  338. OP_rr v30.4s, v6.4s, v12.s[3]
  339. OP_ii v30.4s, v7.4s, v13.s[3]
  340. OP_ri v31.4s, v6.4s, v13.s[3]
  341. OP_ir v31.4s, v7.4s, v12.s[3]
  342. .endm
  /*
   * KERNEL8x4_E: last step of the pipelined 8x4 loop. Accumulates the k-step
   * already held in the "odd" register set (A in v4..v7, B in v12/v13) into
   * v16..v31. It issues no loads and does not move pA, ppA or pB.
   */
  343. .macro KERNEL8x4_E
  // A (pA) x B elements 0..3 -> v16/v17, v20/v21, v24/v25, v28/v29
  344. OP_rr v16.4s, v4.4s, v12.s[0]
  345. OP_ii v16.4s, v5.4s, v13.s[0]
  346. OP_ri v17.4s, v4.4s, v13.s[0]
  347. OP_ir v17.4s, v5.4s, v12.s[0]
  348. OP_rr v20.4s, v4.4s, v12.s[1]
  349. OP_ii v20.4s, v5.4s, v13.s[1]
  350. OP_ri v21.4s, v4.4s, v13.s[1]
  351. OP_ir v21.4s, v5.4s, v12.s[1]
  352. OP_rr v24.4s, v4.4s, v12.s[2]
  353. OP_ii v24.4s, v5.4s, v13.s[2]
  354. OP_ri v25.4s, v4.4s, v13.s[2]
  355. OP_ir v25.4s, v5.4s, v12.s[2]
  356. OP_rr v28.4s, v4.4s, v12.s[3]
  357. OP_ii v28.4s, v5.4s, v13.s[3]
  358. OP_ri v29.4s, v4.4s, v13.s[3]
  359. OP_ir v29.4s, v5.4s, v12.s[3]
  // A (ppA) x B elements 0..3 -> v18/v19, v22/v23, v26/v27, v30/v31
  360. OP_rr v18.4s, v6.4s, v12.s[0]
  361. OP_ii v18.4s, v7.4s, v13.s[0]
  362. OP_ri v19.4s, v6.4s, v13.s[0]
  363. OP_ir v19.4s, v7.4s, v12.s[0]
  364. OP_rr v22.4s, v6.4s, v12.s[1]
  365. OP_ii v22.4s, v7.4s, v13.s[1]
  366. OP_ri v23.4s, v6.4s, v13.s[1]
  367. OP_ir v23.4s, v7.4s, v12.s[1]
  368. OP_rr v26.4s, v6.4s, v12.s[2]
  369. OP_ii v26.4s, v7.4s, v13.s[2]
  370. OP_ri v27.4s, v6.4s, v13.s[2]
  371. OP_ir v27.4s, v7.4s, v12.s[2]
  372. OP_rr v30.4s, v6.4s, v12.s[3]
  373. OP_ii v30.4s, v7.4s, v13.s[3]
  374. OP_ri v31.4s, v6.4s, v13.s[3]
  375. OP_ir v31.4s, v7.4s, v12.s[3]
  376. .endm
  /*
   * KERNEL8x4_SUB: one stand-alone (non-pipelined) k-step of the 8x4 tile.
   * Reads 4 complex B values (pB), 4 complex A values via pA and 4 via ppA,
   * and accumulates into v16..v31. Each pointer ends 32 bytes further on.
   * Post-indexed ld2 performs the load and the pointer bump in one go.
   */
  .macro KERNEL8x4_SUB
          ld2     {v8.4s, v9.4s}, [pB], #32       // v8 = B real, v9 = B imag
          ld2     {v0.4s, v1.4s}, [pA], #32       // v0 = A real, v1 = A imag

          // A (pA) x B element 0 -> v16/v17
          OP_rr   v16.4s, v0.4s, v8.s[0]
          OP_ii   v16.4s, v1.4s, v9.s[0]
          OP_ri   v17.4s, v0.4s, v9.s[0]
          OP_ir   v17.4s, v1.4s, v8.s[0]

          // A (pA) x B element 1 -> v20/v21
          OP_rr   v20.4s, v0.4s, v8.s[1]
          OP_ii   v20.4s, v1.4s, v9.s[1]
          OP_ri   v21.4s, v0.4s, v9.s[1]
          OP_ir   v21.4s, v1.4s, v8.s[1]

          ld2     {v2.4s, v3.4s}, [ppA], #32      // second group of A values

          // A (pA) x B element 2 -> v24/v25
          OP_rr   v24.4s, v0.4s, v8.s[2]
          OP_ii   v24.4s, v1.4s, v9.s[2]
          OP_ri   v25.4s, v0.4s, v9.s[2]
          OP_ir   v25.4s, v1.4s, v8.s[2]

          // A (pA) x B element 3 -> v28/v29
          OP_rr   v28.4s, v0.4s, v8.s[3]
          OP_ii   v28.4s, v1.4s, v9.s[3]
          OP_ri   v29.4s, v0.4s, v9.s[3]
          OP_ir   v29.4s, v1.4s, v8.s[3]

          // A (ppA) x B element 0 -> v18/v19
          OP_rr   v18.4s, v2.4s, v8.s[0]
          OP_ii   v18.4s, v3.4s, v9.s[0]
          OP_ri   v19.4s, v2.4s, v9.s[0]
          OP_ir   v19.4s, v3.4s, v8.s[0]

          // A (ppA) x B element 1 -> v22/v23
          OP_rr   v22.4s, v2.4s, v8.s[1]
          OP_ii   v22.4s, v3.4s, v9.s[1]
          OP_ri   v23.4s, v2.4s, v9.s[1]
          OP_ir   v23.4s, v3.4s, v8.s[1]

          // A (ppA) x B element 2 -> v26/v27
          OP_rr   v26.4s, v2.4s, v8.s[2]
          OP_ii   v26.4s, v3.4s, v9.s[2]
          OP_ri   v27.4s, v2.4s, v9.s[2]
          OP_ir   v27.4s, v3.4s, v8.s[2]

          // A (ppA) x B element 3 -> v30/v31
          OP_rr   v30.4s, v2.4s, v8.s[3]
          OP_ii   v30.4s, v3.4s, v9.s[3]
          OP_ri   v31.4s, v2.4s, v9.s[3]
          OP_ir   v31.4s, v3.4s, v8.s[3]
  .endm
  /*
   * SAVE8x4: fold the 8x4 accumulators into C.
   * For each of the four lines of C (LDC bytes apart) two groups of four
   * complex values are updated: the first at pCRow1, the second 32 bytes
   * further on at pCRow2. For every value:
   *     C_re += acc_re * alpha0_R - acc_im * alpha0_I
   *     C_im += acc_re * alpha1_I + acc_im * alpha1_R
   * LDC is added to the pointer unscaled, i.e. it is used as a byte stride.
   * Clobbers v0..v7, pCRow1, pCRow2; advances pCRow0 by 64 bytes.
   * The post-indexed st2 stores and then steps pCRow1 by LDC.
   */
  .macro SAVE8x4
          mov     pCRow1, pCRow0
          add     pCRow2, pCRow1, #32

          // line 0: v16/v17 (first group), v18/v19 (second group)
          ld2     {v0.4s, v1.4s}, [pCRow1]
          fmla    v0.4s, v16.4s, alphaV0_R
          fmls    v0.4s, v17.4s, alphaV0_I
          fmla    v1.4s, v16.4s, alphaV1_I
          fmla    v1.4s, v17.4s, alphaV1_R
          st2     {v0.4s, v1.4s}, [pCRow1], LDC

          ld2     {v2.4s, v3.4s}, [pCRow2]
          fmla    v2.4s, v18.4s, alphaV0_R
          fmls    v2.4s, v19.4s, alphaV0_I
          fmla    v3.4s, v18.4s, alphaV1_I
          fmla    v3.4s, v19.4s, alphaV1_R
          st2     {v2.4s, v3.4s}, [pCRow2]
          add     pCRow2, pCRow1, #32

          // line 1: v20/v21, v22/v23
          ld2     {v4.4s, v5.4s}, [pCRow1]
          fmla    v4.4s, v20.4s, alphaV0_R
          fmls    v4.4s, v21.4s, alphaV0_I
          fmla    v5.4s, v20.4s, alphaV1_I
          fmla    v5.4s, v21.4s, alphaV1_R
          st2     {v4.4s, v5.4s}, [pCRow1], LDC

          ld2     {v6.4s, v7.4s}, [pCRow2]
          fmla    v6.4s, v22.4s, alphaV0_R
          fmls    v6.4s, v23.4s, alphaV0_I
          fmla    v7.4s, v22.4s, alphaV1_I
          fmla    v7.4s, v23.4s, alphaV1_R
          st2     {v6.4s, v7.4s}, [pCRow2]
          add     pCRow2, pCRow1, #32

          // line 2: v24/v25, v26/v27
          ld2     {v0.4s, v1.4s}, [pCRow1]
          fmla    v0.4s, v24.4s, alphaV0_R
          fmls    v0.4s, v25.4s, alphaV0_I
          fmla    v1.4s, v24.4s, alphaV1_I
          fmla    v1.4s, v25.4s, alphaV1_R
          st2     {v0.4s, v1.4s}, [pCRow1], LDC

          ld2     {v2.4s, v3.4s}, [pCRow2]
          fmla    v2.4s, v26.4s, alphaV0_R
          fmls    v2.4s, v27.4s, alphaV0_I
          fmla    v3.4s, v26.4s, alphaV1_I
          fmla    v3.4s, v27.4s, alphaV1_R
          st2     {v2.4s, v3.4s}, [pCRow2]
          add     pCRow2, pCRow1, #32

          // line 3: v28/v29, v30/v31
          ld2     {v4.4s, v5.4s}, [pCRow1]
          fmla    v4.4s, v28.4s, alphaV0_R
          fmls    v4.4s, v29.4s, alphaV0_I
          fmla    v5.4s, v28.4s, alphaV1_I
          fmla    v5.4s, v29.4s, alphaV1_R
          st2     {v4.4s, v5.4s}, [pCRow1], LDC

          ld2     {v6.4s, v7.4s}, [pCRow2]
          fmla    v6.4s, v30.4s, alphaV0_R
          fmls    v6.4s, v31.4s, alphaV0_I
          fmla    v7.4s, v30.4s, alphaV1_I
          fmla    v7.4s, v31.4s, alphaV1_R
          st2     {v6.4s, v7.4s}, [pCRow2]

          add     pCRow0, pCRow0, #64             // next 8 complex values of C
  .endm
  477. /******************************************************************************/
  /*
   * INIT4x4: clear the eight accumulators of the 4x4 tile
   * (v16/v17, v20/v21, v24/v25, v28/v29). A scalar fmov write leaves the
   * remaining lanes of the vector register zero as well.
   */
  .macro INIT4x4
          fmov    s16, wzr
          fmov    s17, wzr
          fmov    s20, wzr
          fmov    s21, wzr
          fmov    s24, wzr
          fmov    s25, wzr
          fmov    s28, wzr
          fmov    s29, wzr
  .endm
  /*
   * KERNEL4x4_SUB: one k-step of the 4x4 tile.
   * Reads 4 complex B values and 4 complex A values (32 bytes each, pointers
   * advanced by the post-indexed loads) and accumulates real/imaginary parts
   * into v16/v17, v20/v21, v24/v25, v28/v29 for B elements 0..3.
   */
  .macro KERNEL4x4_SUB
          ld2     {v8.4s, v9.4s}, [pB], #32       // v8 = B real, v9 = B imag
          ld2     {v0.4s, v1.4s}, [pA], #32       // v0 = A real, v1 = A imag

          // B element 0
          OP_rr   v16.4s, v0.4s, v8.s[0]
          OP_ii   v16.4s, v1.4s, v9.s[0]
          OP_ri   v17.4s, v0.4s, v9.s[0]
          OP_ir   v17.4s, v1.4s, v8.s[0]

          // B element 1
          OP_rr   v20.4s, v0.4s, v8.s[1]
          OP_ii   v20.4s, v1.4s, v9.s[1]
          OP_ri   v21.4s, v0.4s, v9.s[1]
          OP_ir   v21.4s, v1.4s, v8.s[1]

          // B element 2
          OP_rr   v24.4s, v0.4s, v8.s[2]
          OP_ii   v24.4s, v1.4s, v9.s[2]
          OP_ri   v25.4s, v0.4s, v9.s[2]
          OP_ir   v25.4s, v1.4s, v8.s[2]

          // B element 3
          OP_rr   v28.4s, v0.4s, v8.s[3]
          OP_ii   v28.4s, v1.4s, v9.s[3]
          OP_ri   v29.4s, v0.4s, v9.s[3]
          OP_ir   v29.4s, v1.4s, v8.s[3]
  .endm
  /*
   * SAVE4x4: fold the 4x4 accumulators into C, four complex values per line,
   * four lines LDC bytes apart:
   *     C_re += acc_re * alpha0_R - acc_im * alpha0_I
   *     C_im += acc_re * alpha1_I + acc_im * alpha1_R
   * Clobbers v0, v1, v4, v5 and pCRow1; advances pCRow0 by 32 bytes.
   * The post-indexed st2 stores and then steps pCRow1 by LDC.
   */
  .macro SAVE4x4
          mov     pCRow1, pCRow0

          // line 0: v16/v17
          ld2     {v0.4s, v1.4s}, [pCRow1]
          fmla    v0.4s, v16.4s, alphaV0_R
          fmls    v0.4s, v17.4s, alphaV0_I
          fmla    v1.4s, v16.4s, alphaV1_I
          fmla    v1.4s, v17.4s, alphaV1_R
          st2     {v0.4s, v1.4s}, [pCRow1], LDC

          // line 1: v20/v21
          ld2     {v4.4s, v5.4s}, [pCRow1]
          fmla    v4.4s, v20.4s, alphaV0_R
          fmls    v4.4s, v21.4s, alphaV0_I
          fmla    v5.4s, v20.4s, alphaV1_I
          fmla    v5.4s, v21.4s, alphaV1_R
          st2     {v4.4s, v5.4s}, [pCRow1], LDC

          // line 2: v24/v25
          ld2     {v0.4s, v1.4s}, [pCRow1]
          fmla    v0.4s, v24.4s, alphaV0_R
          fmls    v0.4s, v25.4s, alphaV0_I
          fmla    v1.4s, v24.4s, alphaV1_I
          fmla    v1.4s, v25.4s, alphaV1_R
          st2     {v0.4s, v1.4s}, [pCRow1], LDC

          // line 3: v28/v29 (pCRow1 is not stepped after the last store)
          ld2     {v4.4s, v5.4s}, [pCRow1]
          fmla    v4.4s, v28.4s, alphaV0_R
          fmls    v4.4s, v29.4s, alphaV0_I
          fmla    v5.4s, v28.4s, alphaV1_I
          fmla    v5.4s, v29.4s, alphaV1_R
          st2     {v4.4s, v5.4s}, [pCRow1]

          add     pCRow0, pCRow0, #32
  .endm
  541. /******************************************************************************/
  /*
   * INIT2x4: clear the eight accumulators of the 2x4 tile
   * (v16/v17, v20/v21, v24/v25, v28/v29).
   */
  .macro INIT2x4
          fmov    s16, wzr
          fmov    s17, wzr
          fmov    s20, wzr
          fmov    s21, wzr
          fmov    s24, wzr
          fmov    s25, wzr
          fmov    s28, wzr
          fmov    s29, wzr
  .endm
  /*
   * KERNEL2x4_SUB: one k-step of the 2x4 tile.
   * Reads 4 complex B values (32 bytes) and 2 complex A values (16 bytes),
   * advancing both pointers, and accumulates into the low two lanes of
   * v16/v17, v20/v21, v24/v25, v28/v29 for B elements 0..3.
   */
  .macro KERNEL2x4_SUB
          ld2     {v8.4s, v9.4s}, [pB], #32       // v8 = B real, v9 = B imag
          ld2     {v0.2s, v1.2s}, [pA], #16       // v0 = A real, v1 = A imag

          // B element 0
          OP_rr   v16.2s, v0.2s, v8.s[0]
          OP_ii   v16.2s, v1.2s, v9.s[0]
          OP_ri   v17.2s, v0.2s, v9.s[0]
          OP_ir   v17.2s, v1.2s, v8.s[0]

          // B element 1
          OP_rr   v20.2s, v0.2s, v8.s[1]
          OP_ii   v20.2s, v1.2s, v9.s[1]
          OP_ri   v21.2s, v0.2s, v9.s[1]
          OP_ir   v21.2s, v1.2s, v8.s[1]

          // B element 2
          OP_rr   v24.2s, v0.2s, v8.s[2]
          OP_ii   v24.2s, v1.2s, v9.s[2]
          OP_ri   v25.2s, v0.2s, v9.s[2]
          OP_ir   v25.2s, v1.2s, v8.s[2]

          // B element 3
          OP_rr   v28.2s, v0.2s, v8.s[3]
          OP_ii   v28.2s, v1.2s, v9.s[3]
          OP_ri   v29.2s, v0.2s, v9.s[3]
          OP_ir   v29.2s, v1.2s, v8.s[3]
  .endm
  /*
   * SAVE2x4: fold the 2x4 accumulators into C, two complex values per line,
   * four lines LDC bytes apart:
   *     C_re += acc_re * alpha0_R - acc_im * alpha0_I
   *     C_im += acc_re * alpha1_I + acc_im * alpha1_R
   * Clobbers v0, v1, v4, v5 and pCRow1; advances pCRow0 by 16 bytes.
   * The post-indexed st2 stores and then steps pCRow1 by LDC.
   */
  .macro SAVE2x4
          mov     pCRow1, pCRow0

          // line 0: v16/v17
          ld2     {v0.2s, v1.2s}, [pCRow1]
          fmla    v0.2s, v16.2s, alphaV0_R
          fmls    v0.2s, v17.2s, alphaV0_I
          fmla    v1.2s, v16.2s, alphaV1_I
          fmla    v1.2s, v17.2s, alphaV1_R
          st2     {v0.2s, v1.2s}, [pCRow1], LDC

          // line 1: v20/v21
          ld2     {v4.2s, v5.2s}, [pCRow1]
          fmla    v4.2s, v20.2s, alphaV0_R
          fmls    v4.2s, v21.2s, alphaV0_I
          fmla    v5.2s, v20.2s, alphaV1_I
          fmla    v5.2s, v21.2s, alphaV1_R
          st2     {v4.2s, v5.2s}, [pCRow1], LDC

          // line 2: v24/v25
          ld2     {v0.2s, v1.2s}, [pCRow1]
          fmla    v0.2s, v24.2s, alphaV0_R
          fmls    v0.2s, v25.2s, alphaV0_I
          fmla    v1.2s, v24.2s, alphaV1_I
          fmla    v1.2s, v25.2s, alphaV1_R
          st2     {v0.2s, v1.2s}, [pCRow1], LDC

          // line 3: v28/v29 (pCRow1 is not stepped after the last store)
          ld2     {v4.2s, v5.2s}, [pCRow1]
          fmla    v4.2s, v28.2s, alphaV0_R
          fmls    v4.2s, v29.2s, alphaV0_I
          fmla    v5.2s, v28.2s, alphaV1_I
          fmla    v5.2s, v29.2s, alphaV1_R
          st2     {v4.2s, v5.2s}, [pCRow1]

          add     pCRow0, pCRow0, #16
  .endm
  605. /******************************************************************************/
  /*
   * INIT1x4: clear the eight scalar accumulators of the 1x4 tile
   * (s16/s17, s20/s21, s24/s25, s28/s29).
   */
  .macro INIT1x4
          fmov    s16, wzr
          fmov    s17, wzr
          fmov    s20, wzr
          fmov    s21, wzr
          fmov    s24, wzr
          fmov    s25, wzr
          fmov    s28, wzr
          fmov    s29, wzr
  .endm
  /*
   * KERNEL1x4_SUB: one k-step of the 1x4 tile.
   * Reads 4 complex B values (32 bytes) and a single complex A value
   * (8 bytes, into lane 0 of v0/v1), advancing both pointers, and
   * accumulates into the scalars s16/s17, s20/s21, s24/s25, s28/s29.
   */
  .macro KERNEL1x4_SUB
          ld2     {v8.4s, v9.4s}, [pB], #32       // v8 = B real, v9 = B imag
          ld2     {v0.s, v1.s}[0], [pA], #8       // s0 = A real, s1 = A imag

          // B element 0
          OP_rr   s16, s0, v8.s[0]
          OP_ii   s16, s1, v9.s[0]
          OP_ri   s17, s0, v9.s[0]
          OP_ir   s17, s1, v8.s[0]

          // B element 1
          OP_rr   s20, s0, v8.s[1]
          OP_ii   s20, s1, v9.s[1]
          OP_ri   s21, s0, v9.s[1]
          OP_ir   s21, s1, v8.s[1]

          // B element 2
          OP_rr   s24, s0, v8.s[2]
          OP_ii   s24, s1, v9.s[2]
          OP_ri   s25, s0, v9.s[2]
          OP_ir   s25, s1, v8.s[2]

          // B element 3
          OP_rr   s28, s0, v8.s[3]
          OP_ii   s28, s1, v9.s[3]
          OP_ri   s29, s0, v9.s[3]
          OP_ir   s29, s1, v8.s[3]
  .endm
  /*
   * SAVE1x4: fold the 1x4 accumulators into C, one complex value per line,
   * four lines LDC bytes apart:
   *     C_re += acc_re * alpha0_R - acc_im * alpha0_I
   *     C_im += acc_re * alpha1_I + acc_im * alpha1_R
   * Clobbers v0, v1, v4, v5 and pCRow1; advances pCRow0 by 8 bytes.
   * The post-indexed st2 stores and then steps pCRow1 by LDC.
   */
  .macro SAVE1x4
          mov     pCRow1, pCRow0

          // line 0: s16/s17
          ld2     {v0.s, v1.s}[0], [pCRow1]
          fmla    s0, s16, alphaV0_R
          fmls    s0, s17, alphaV0_I
          fmla    s1, s16, alphaV1_I
          fmla    s1, s17, alphaV1_R
          st2     {v0.s, v1.s}[0], [pCRow1], LDC

          // line 1: s20/s21
          ld2     {v4.s, v5.s}[0], [pCRow1]
          fmla    s4, s20, alphaV0_R
          fmls    s4, s21, alphaV0_I
          fmla    s5, s20, alphaV1_I
          fmla    s5, s21, alphaV1_R
          st2     {v4.s, v5.s}[0], [pCRow1], LDC

          // line 2: s24/s25
          ld2     {v0.s, v1.s}[0], [pCRow1]
          fmla    s0, s24, alphaV0_R
          fmls    s0, s25, alphaV0_I
          fmla    s1, s24, alphaV1_I
          fmla    s1, s25, alphaV1_R
          st2     {v0.s, v1.s}[0], [pCRow1], LDC

          // line 3: s28/s29 (pCRow1 is not stepped after the last store)
          ld2     {v4.s, v5.s}[0], [pCRow1]
          fmla    s4, s28, alphaV0_R
          fmls    s4, s29, alphaV0_I
          fmla    s5, s28, alphaV1_I
          fmla    s5, s29, alphaV1_R
          st2     {v4.s, v5.s}[0], [pCRow1]

          add     pCRow0, pCRow0, #8
  .endm
  669. /******************************************************************************/
  /*
   * INIT4x2: clear the four accumulators of the 4x2 tile (v16/v17, v20/v21).
   * A scalar fmov write leaves the remaining vector lanes zero as well.
   */
  .macro INIT4x2
          fmov    s16, wzr
          fmov    s17, wzr
          fmov    s20, wzr
          fmov    s21, wzr
  .endm
  /*
   * KERNEL4x2_SUB: one k-step of the 4x2 tile.
   * Reads 2 complex B values (16 bytes) and 4 complex A values (32 bytes),
   * advancing both pointers, and accumulates into v16/v17 (B element 0)
   * and v20/v21 (B element 1).
   */
  .macro KERNEL4x2_SUB
          ld2     {v8.2s, v9.2s}, [pB], #16       // v8 = B real, v9 = B imag
          ld2     {v0.4s, v1.4s}, [pA], #32       // v0 = A real, v1 = A imag

          // B element 0
          OP_rr   v16.4s, v0.4s, v8.s[0]
          OP_ii   v16.4s, v1.4s, v9.s[0]
          OP_ri   v17.4s, v0.4s, v9.s[0]
          OP_ir   v17.4s, v1.4s, v8.s[0]

          // B element 1
          OP_rr   v20.4s, v0.4s, v8.s[1]
          OP_ii   v20.4s, v1.4s, v9.s[1]
          OP_ri   v21.4s, v0.4s, v9.s[1]
          OP_ir   v21.4s, v1.4s, v8.s[1]
  .endm
  /*
   * SAVE4x2: fold the 4x2 accumulators into C, four complex values per line,
   * two lines LDC bytes apart:
   *     C_re += acc_re * alpha0_R - acc_im * alpha0_I
   *     C_im += acc_re * alpha1_I + acc_im * alpha1_R
   * Clobbers v0, v1, v4, v5 and pCRow1; advances pCRow0 by 32 bytes.
   */
  .macro SAVE4x2
          mov     pCRow1, pCRow0

          // line 0: v16/v17; the post-indexed store steps pCRow1 by LDC
          ld2     {v0.4s, v1.4s}, [pCRow1]
          fmla    v0.4s, v16.4s, alphaV0_R
          fmls    v0.4s, v17.4s, alphaV0_I
          fmla    v1.4s, v16.4s, alphaV1_I
          fmla    v1.4s, v17.4s, alphaV1_R
          st2     {v0.4s, v1.4s}, [pCRow1], LDC

          // line 1: v20/v21
          ld2     {v4.4s, v5.4s}, [pCRow1]
          fmla    v4.4s, v20.4s, alphaV0_R
          fmls    v4.4s, v21.4s, alphaV0_I
          fmla    v5.4s, v20.4s, alphaV1_I
          fmla    v5.4s, v21.4s, alphaV1_R
          st2     {v4.4s, v5.4s}, [pCRow1]

          add     pCRow0, pCRow0, #32
  .endm
  707. /******************************************************************************/
  /*
   * INIT2x2: clear the four accumulators of the 2x2 tile (v16/v17, v20/v21).
   */
  .macro INIT2x2
          fmov    s16, wzr
          fmov    s17, wzr
          fmov    s20, wzr
          fmov    s21, wzr
  .endm
  /*
   * KERNEL2x2_SUB: one k-step of the 2x2 tile.
   * Reads 2 complex B values and 2 complex A values (16 bytes each),
   * advancing both pointers, and accumulates into the low two lanes of
   * v16/v17 (B element 0) and v20/v21 (B element 1).
   */
  .macro KERNEL2x2_SUB
          ld2     {v8.2s, v9.2s}, [pB], #16       // v8 = B real, v9 = B imag
          ld2     {v0.2s, v1.2s}, [pA], #16       // v0 = A real, v1 = A imag

          // B element 0
          OP_rr   v16.2s, v0.2s, v8.s[0]
          OP_ii   v16.2s, v1.2s, v9.s[0]
          OP_ri   v17.2s, v0.2s, v9.s[0]
          OP_ir   v17.2s, v1.2s, v8.s[0]

          // B element 1
          OP_rr   v20.2s, v0.2s, v8.s[1]
          OP_ii   v20.2s, v1.2s, v9.s[1]
          OP_ri   v21.2s, v0.2s, v9.s[1]
          OP_ir   v21.2s, v1.2s, v8.s[1]
  .endm
  /*
   * SAVE2x2: fold the 2x2 accumulators into C, two complex values per line,
   * two lines LDC bytes apart:
   *     C_re += acc_re * alpha0_R - acc_im * alpha0_I
   *     C_im += acc_re * alpha1_I + acc_im * alpha1_R
   * Clobbers v0, v1, v4, v5 and pCRow1; advances pCRow0 by 16 bytes.
   */
  .macro SAVE2x2
          mov     pCRow1, pCRow0

          // line 0: v16/v17; the post-indexed store steps pCRow1 by LDC
          ld2     {v0.2s, v1.2s}, [pCRow1]
          fmla    v0.2s, v16.2s, alphaV0_R
          fmls    v0.2s, v17.2s, alphaV0_I
          fmla    v1.2s, v16.2s, alphaV1_I
          fmla    v1.2s, v17.2s, alphaV1_R
          st2     {v0.2s, v1.2s}, [pCRow1], LDC

          // line 1: v20/v21
          ld2     {v4.2s, v5.2s}, [pCRow1]
          fmla    v4.2s, v20.2s, alphaV0_R
          fmls    v4.2s, v21.2s, alphaV0_I
          fmla    v5.2s, v20.2s, alphaV1_I
          fmla    v5.2s, v21.2s, alphaV1_R
          st2     {v4.2s, v5.2s}, [pCRow1]

          add     pCRow0, pCRow0, #16
  .endm
  745. /******************************************************************************/
  /*
   * INIT1x2: clear the four scalar accumulators of the 1x2 tile
   * (s16/s17, s20/s21). s16 is zeroed from wzr and copied to the others.
   */
  .macro INIT1x2
          fmov    s16, wzr
          fmov    s17, s16
          fmov    s20, s16
          fmov    s21, s16
  .endm
  /*
   * KERNEL1x2_SUB: one k-step of the 1x2 tile.
   * Reads 2 complex B values (16 bytes) and a single complex A value
   * (8 bytes, into lane 0 of v0/v1), advancing both pointers, and
   * accumulates into the scalars s16/s17 and s20/s21.
   */
  .macro KERNEL1x2_SUB
          ld2     {v8.2s, v9.2s}, [pB], #16       // v8 = B real, v9 = B imag
          ld2     {v0.s, v1.s}[0], [pA], #8       // s0 = A real, s1 = A imag

          // B element 0
          OP_rr   s16, s0, v8.s[0]
          OP_ii   s16, s1, v9.s[0]
          OP_ri   s17, s0, v9.s[0]
          OP_ir   s17, s1, v8.s[0]

          // B element 1
          OP_rr   s20, s0, v8.s[1]
          OP_ii   s20, s1, v9.s[1]
          OP_ri   s21, s0, v9.s[1]
          OP_ir   s21, s1, v8.s[1]
  .endm
  /*
   * SAVE1x2 - fold the 1x2 accumulators into C.
   * Updates one 8-byte pair at pCRow0 from (s16, s17) and one pair at
   * pCRow0 + LDC from (s20, s21):
   *     first  float += acc_a * alpha_R - acc_b * alpha_I
   *     second float += acc_a * alpha_I + acc_b * alpha_R
   * On exit pCRow0 has advanced by 8 bytes and pCRow1 holds the address of
   * the second row. Clobbers v0, v1, v4, v5.
   */
  .macro SAVE1x2
      // first row
      ld2     {v0.s, v1.s}[0], [pCRow0]
      add     pCRow1, pCRow0, LDC             // second row, computed before pCRow0 moves
      fmla    s0, s16, alphaV0_R
      fmla    s1, s16, alphaV1_I
      fmls    s0, s17, alphaV0_I
      fmla    s1, s17, alphaV1_R
      st2     {v0.s, v1.s}[0], [pCRow0], #8   // store, then pCRow0 += 8

      // second row (loaded only after the first row has been stored)
      ld2     {v4.s, v5.s}[0], [pCRow1]
      fmla    s4, s20, alphaV0_R
      fmla    s5, s20, alphaV1_I
      fmls    s4, s21, alphaV0_I
      fmla    s5, s21, alphaV1_R
      st2     {v4.s, v5.s}[0], [pCRow1]
  .endm
  783. /******************************************************************************/
  /*
   * INIT4x1 - clear the two accumulators of the 4x1 tile (v16, v17),
   * which KERNEL4x1_SUB uses as four-lane vectors.
   */
  .macro INIT4x1
      movi    v16.4s, #0
      movi    v17.4s, #0
  .endm
  /*
   * KERNEL4x1_SUB - one K step of the 4x1 tile.
   * Loads one 8-byte pair from pB into lane 0 of v8/v9 and 32 bytes from pA
   * (de-interleaved into v0/v1), advances both pointers, and applies the
   * OP_* operations (defined elsewhere in this file) into v16/v17.
   * The relative order of the two updates of each accumulator is kept.
   */
  .macro KERNEL4x1_SUB
      ld2     {v8.s, v9.s}[0], [pB], #8       // one B pair, then pB += 8
      ld2     {v0.4s, v1.4s}, [pA], #32       // four A pairs, then pA += 32

      OP_rr   v16.4s, v0.4s, v8.s[0]
      OP_ri   v17.4s, v0.4s, v9.s[0]
      OP_ii   v16.4s, v1.4s, v9.s[0]
      OP_ir   v17.4s, v1.4s, v8.s[0]
  .endm
  /*
   * SAVE4x1 - fold the 4x1 accumulators (v16, v17) into the 32 bytes of C
   * at pCRow0:
   *     even floats += v16 * alpha_R - v17 * alpha_I
   *     odd  floats += v16 * alpha_I + v17 * alpha_R
   * On exit pCRow1 holds the incoming pCRow0 and pCRow0 has advanced by
   * 32 bytes. Clobbers v0, v1.
   */
  .macro SAVE4x1
      mov     pCRow1, pCRow0                  // pCRow1 keeps the start of this tile
      ld2     {v0.4s, v1.4s}, [pCRow0]
      fmla    v0.4s, v16.4s, alphaV0_R
      fmla    v1.4s, v16.4s, alphaV1_I
      fmls    v0.4s, v17.4s, alphaV0_I
      fmla    v1.4s, v17.4s, alphaV1_R
      st2     {v0.4s, v1.4s}, [pCRow0], #32   // store, then pCRow0 += 32
  .endm
  808. /******************************************************************************/
  /*
   * INIT2x1 - clear the two accumulators of the 2x1 tile (v16, v17).
   * A movi with a 64-bit arrangement also zeroes the upper 64 bits, so each
   * register ends up entirely zero.
   */
  .macro INIT2x1
      movi    v16.2s, #0
      movi    v17.2s, #0
  .endm
  /*
   * KERNEL2x1_SUB - one K step of the 2x1 tile.
   * Loads one 8-byte pair from pB into lane 0 of v8/v9 and 16 bytes from pA
   * (de-interleaved into v0/v1), advances both pointers, and applies the
   * OP_* operations (defined elsewhere in this file) into v16/v17.
   * The relative order of the two updates of each accumulator is kept.
   */
  .macro KERNEL2x1_SUB
      ld2     {v8.s, v9.s}[0], [pB], #8       // one B pair, then pB += 8
      ld2     {v0.2s, v1.2s}, [pA], #16       // two A pairs, then pA += 16

      OP_rr   v16.2s, v0.2s, v8.s[0]
      OP_ri   v17.2s, v0.2s, v9.s[0]
      OP_ii   v16.2s, v1.2s, v9.s[0]
      OP_ir   v17.2s, v1.2s, v8.s[0]
  .endm
  /*
   * SAVE2x1 - fold the 2x1 accumulators (v16, v17) into the 16 bytes of C
   * at pCRow0:
   *     even floats += v16 * alpha_R - v17 * alpha_I
   *     odd  floats += v16 * alpha_I + v17 * alpha_R
   * On exit pCRow1 holds the incoming pCRow0 and pCRow0 has advanced by
   * 16 bytes. Clobbers v0, v1.
   */
  .macro SAVE2x1
      mov     pCRow1, pCRow0                  // pCRow1 keeps the start of this tile
      ld2     {v0.2s, v1.2s}, [pCRow0]
      fmla    v0.2s, v16.2s, alphaV0_R
      fmla    v1.2s, v16.2s, alphaV1_I
      fmls    v0.2s, v17.2s, alphaV0_I
      fmla    v1.2s, v17.2s, alphaV1_R
      st2     {v0.2s, v1.2s}, [pCRow0], #16   // store, then pCRow0 += 16
  .endm
  833. /******************************************************************************/
  /*
   * INIT1x1 - clear the two scalar accumulators of the 1x1 tile (s16, s17).
   * A scalar fmov zeroes the rest of the destination vector register, so
   * copying the freshly cleared s16 leaves s17's register entirely zero too.
   */
  .macro INIT1x1
      fmov    s16, wzr
      fmov    s17, s16
  .endm
  /*
   * KERNEL1x1_SUB - one K step of the 1x1 tile.
   * Loads one 8-byte pair from pB into lane 0 of v8/v9 and one 8-byte pair
   * from pA into lane 0 of v0/v1, advances both pointers, and applies the
   * OP_* operations (defined elsewhere in this file) into s16/s17.
   * The relative order of the two updates of each accumulator is kept.
   */
  .macro KERNEL1x1_SUB
      ld2     {v8.s, v9.s}[0], [pB], #8       // one B pair, then pB += 8
      ld2     {v0.s, v1.s}[0], [pA], #8       // one A pair, then pA += 8

      OP_rr   s16, s0, v8.s[0]
      OP_ri   s17, s0, v9.s[0]
      OP_ii   s16, s1, v9.s[0]
      OP_ir   s17, s1, v8.s[0]
  .endm
  /*
   * SAVE1x1 - fold the 1x1 accumulators (s16, s17) into the 8-byte pair of
   * C at pCRow0:
   *     first  float += s16 * alpha_R - s17 * alpha_I
   *     second float += s16 * alpha_I + s17 * alpha_R
   * On exit pCRow1 holds the incoming pCRow0 and pCRow0 has advanced by
   * 8 bytes. Clobbers v0, v1.
   */
  .macro SAVE1x1
      mov     pCRow1, pCRow0                  // pCRow1 keeps the start of this tile
      ld2     {v0.s, v1.s}[0], [pCRow0]
      fmla    s0, s16, alphaV0_R
      fmla    s1, s16, alphaV1_I
      fmls    s0, s17, alphaV0_I
      fmla    s1, s17, alphaV1_R
      st2     {v0.s, v1.s}[0], [pCRow0], #8   // store, then pCRow0 += 8
  .endm
  858. /*******************************************************************************
  859. * End of macro definitions
  860. *******************************************************************************/
  /*
   * Kernel body. PROLOGUE/EPILOGUE, the register aliases (origM, origN,
   * origK, origPA, origPB, pC, LDC, pA, ppA, pB, pCRow0, counterI/J/L, temp,
   * alpha*) and the 8x4 / 4x4 / 2x4 / 1x4 / 4x2 macros are defined outside
   * this section of the file.
   *
   * Structure:
   *   - N is walked in groups of 4 columns (origN >> 2 iterations), then one
   *     2-column pass if bit 1 of origN is set, then one 1-column pass if
   *     bit 0 is set.
   *   - In the 4-column pass M is walked in groups of 8 rows (origM >> 3
   *     iterations), then at most one 4-, one 2- and one 1-row tile,
   *     selected by bits 2, 1 and 0 of origM.
   *   - In the 2- and 1-column passes M is walked in groups of 4 rows
   *     (origM >> 2 iterations), then at most one 2- and one 1-row tile.
   *   - Except for the 8x4 tile, each tile runs origK / 8 unrolled groups of
   *     eight KERNEL*_SUB steps followed by origK % 8 single steps.
   *
   * Note on "tst/ands ... ; ble": tst/ands clear V and the masked results
   * used here are never negative, so ble is taken exactly when the masked
   * result is zero.
   *
   * Always returns 0 in x0.
   */
  861. PROLOGUE
  862. .align 5
  // Reserve 11 * 16 bytes of stack and spill d8-d17 and x18-x28 into it.
  // (d8-d15 and x19-x28 are callee-saved under AAPCS64; d16/d17 and x18 are
  // saved here as well.)
  863. add sp, sp, #-(11 * 16)
  864. stp d8, d9, [sp, #(0 * 16)]
  865. stp d10, d11, [sp, #(1 * 16)]
  866. stp d12, d13, [sp, #(2 * 16)]
  867. stp d14, d15, [sp, #(3 * 16)]
  868. stp d16, d17, [sp, #(4 * 16)]
  869. stp x18, x19, [sp, #(5 * 16)]
  870. stp x20, x21, [sp, #(6 * 16)]
  871. stp x22, x23, [sp, #(7 * 16)]
  872. stp x24, x25, [sp, #(8 * 16)]
  873. stp x26, x27, [sp, #(9 * 16)]
  874. str x28, [sp, #(10 * 16)]
  // Copy the incoming s0/s1 into both alpha register sets (going by the
  // alias names, s0 is the real part and s1 the imaginary part of alpha).
  875. fmov alpha0_R, s0
  876. fmov alpha0_I, s1
  877. fmov alpha1_R, s0
  878. fmov alpha1_I, s1
  879. lsl LDC, LDC, #3 // ldc = ldc * 8 (presumably bytes per complex float - confirm)
  880. mov pB, origPB
  881. mov counterJ, origN
  882. asr counterJ, counterJ, #2 // J = N / 4
  883. cmp counterJ, #0
  884. ble .Lcgemm_kernel_L2_BEGIN // no full group of 4 columns
  885. /******************************************************************************/
  // ---- 4-column pass: one iteration per group of 4 columns ----
  886. .Lcgemm_kernel_L4_BEGIN:
  887. mov pCRow0, pC // pCRow0 = C
  888. add pC, pC, LDC, lsl #2 // pC += 4 * LDC, start of the next 4-column group
  889. lsl temp, origK, #5 // k * 4 * 8
  890. mov pA, origPA // pA = start of A array
  891. add ppA, temp, pA // ppA = pA + K * 32 (consumed by the 8x4 macros, not visible here)
  892. .Lcgemm_kernel_L4_M8_BEGIN:
  893. mov counterI, origM
  894. asr counterI, counterI, #3 // counterI = M / 8
  895. cmp counterI, #0
  896. ble .Lcgemm_kernel_L4_M4_BEGIN
  // 8x4 tile: K is consumed two steps at a time by the KERNEL8x4_I/M1/M2/E
  // sequence, with a single KERNEL8x4_SUB for odd K.
  897. .Lcgemm_kernel_L4_M8_20:
  898. mov pB, origPB
  899. asr counterL , origK, #1 // L = K / 2
  900. cmp counterL , #2 // is there at least 4 to do?
  901. blt .Lcgemm_kernel_L4_M8_32
  902. KERNEL8x4_I // do one in the K
  903. KERNEL8x4_M2 // do another in the K
  904. subs counterL, counterL, #2 // subtract 2
  905. ble .Lcgemm_kernel_L4_M8_22a
  906. .align 5
  907. .Lcgemm_kernel_L4_M8_22:
  908. KERNEL8x4_M1
  909. KERNEL8x4_M2
  910. subs counterL, counterL, #1
  911. bgt .Lcgemm_kernel_L4_M8_22
  912. .Lcgemm_kernel_L4_M8_22a:
  913. KERNEL8x4_M1
  914. KERNEL8x4_E
  915. b .Lcgemm_kernel_L4_M8_44
  // Fewer than two K pairs: either exactly one pair (I + E) or none.
  916. .Lcgemm_kernel_L4_M8_32:
  917. tst counterL, #1
  918. ble .Lcgemm_kernel_L4_M8_40 // no pair at all
  919. KERNEL8x4_I
  920. KERNEL8x4_E
  921. b .Lcgemm_kernel_L4_M8_44
  922. .Lcgemm_kernel_L4_M8_40:
  923. INIT8x4
  924. .Lcgemm_kernel_L4_M8_44:
  925. ands counterL , origK, #1 // counterL = K % 2
  926. ble .Lcgemm_kernel_L4_M8_100 // K even: nothing left
  927. .Lcgemm_kernel_L4_M8_46:
  928. KERNEL8x4_SUB
  929. .Lcgemm_kernel_L4_M8_100:
  930. SAVE8x4
  931. .Lcgemm_kernel_L4_M8_END:
  932. lsl temp, origK, #5 // k * 4 * 8
  933. add pA, pA, temp // advance pA and ppA by a further K * 32 bytes
  934. add ppA, ppA, temp
  935. subs counterI, counterI, #1
  936. bne .Lcgemm_kernel_L4_M8_20
  937. .Lcgemm_kernel_L4_M4_BEGIN:
  938. mov counterI, origM
  939. tst counterI , #7 // any rows left after the groups of 8?
  940. ble .Lcgemm_kernel_L4_END
  941. tst counterI, #4 // is bit 2 of M set (one 4-row tile)?
  942. ble .Lcgemm_kernel_L4_M2_BEGIN
  943. .Lcgemm_kernel_L4_M4_20:
  944. INIT4x4
  945. mov pB, origPB
  946. asr counterL, origK, #3 // counterL = K / 8
  947. cmp counterL, #0
  948. ble .Lcgemm_kernel_L4_M4_40
  949. .Lcgemm_kernel_L4_M4_22:
  950. KERNEL4x4_SUB
  951. KERNEL4x4_SUB
  952. KERNEL4x4_SUB
  953. KERNEL4x4_SUB
  954. KERNEL4x4_SUB
  955. KERNEL4x4_SUB
  956. KERNEL4x4_SUB
  957. KERNEL4x4_SUB
  958. subs counterL, counterL, #1
  959. bgt .Lcgemm_kernel_L4_M4_22
  960. .Lcgemm_kernel_L4_M4_40:
  961. ands counterL , origK, #7 // counterL = K % 8
  962. ble .Lcgemm_kernel_L4_M4_100
  963. .Lcgemm_kernel_L4_M4_42:
  964. KERNEL4x4_SUB
  965. subs counterL, counterL, #1
  966. bgt .Lcgemm_kernel_L4_M4_42
  967. .Lcgemm_kernel_L4_M4_100:
  968. SAVE4x4
  969. .Lcgemm_kernel_L4_M4_END:
  970. .Lcgemm_kernel_L4_M2_BEGIN:
  971. mov counterI, origM
  972. tst counterI , #3 // any rows left below a multiple of 4?
  973. ble .Lcgemm_kernel_L4_END
  974. tst counterI, #2 // is bit 1 of M set (one 2-row tile)?
  975. ble .Lcgemm_kernel_L4_M1_BEGIN
  976. .Lcgemm_kernel_L4_M2_20:
  977. INIT2x4
  978. mov pB, origPB
  979. asr counterL , origK, #3 // counterL = K / 8
  980. cmp counterL , #0
  981. ble .Lcgemm_kernel_L4_M2_40
  982. .Lcgemm_kernel_L4_M2_22:
  983. KERNEL2x4_SUB
  984. KERNEL2x4_SUB
  985. KERNEL2x4_SUB
  986. KERNEL2x4_SUB
  987. KERNEL2x4_SUB
  988. KERNEL2x4_SUB
  989. KERNEL2x4_SUB
  990. KERNEL2x4_SUB
  991. subs counterL, counterL, #1
  992. bgt .Lcgemm_kernel_L4_M2_22
  993. .Lcgemm_kernel_L4_M2_40:
  994. ands counterL , origK, #7 // counterL = K % 8
  995. ble .Lcgemm_kernel_L4_M2_100
  996. .Lcgemm_kernel_L4_M2_42:
  997. KERNEL2x4_SUB
  998. subs counterL, counterL, #1
  999. bgt .Lcgemm_kernel_L4_M2_42
  1000. .Lcgemm_kernel_L4_M2_100:
  1001. SAVE2x4
  1002. .Lcgemm_kernel_L4_M2_END:
  1003. .Lcgemm_kernel_L4_M1_BEGIN:
  1004. tst counterI, #1 // is M odd (one 1-row tile)?
  1005. ble .Lcgemm_kernel_L4_END
  1006. .Lcgemm_kernel_L4_M1_20:
  1007. INIT1x4
  1008. mov pB, origPB
  1009. asr counterL , origK, #3 // counterL = K / 8
  1010. cmp counterL , #0
  1011. ble .Lcgemm_kernel_L4_M1_40
  1012. .Lcgemm_kernel_L4_M1_22:
  1013. KERNEL1x4_SUB
  1014. KERNEL1x4_SUB
  1015. KERNEL1x4_SUB
  1016. KERNEL1x4_SUB
  1017. KERNEL1x4_SUB
  1018. KERNEL1x4_SUB
  1019. KERNEL1x4_SUB
  1020. KERNEL1x4_SUB
  1021. subs counterL, counterL, #1
  1022. bgt .Lcgemm_kernel_L4_M1_22
  1023. .Lcgemm_kernel_L4_M1_40:
  1024. ands counterL , origK, #7 // counterL = K % 8
  1025. ble .Lcgemm_kernel_L4_M1_100
  1026. .Lcgemm_kernel_L4_M1_42:
  1027. KERNEL1x4_SUB
  1028. subs counterL, counterL, #1
  1029. bgt .Lcgemm_kernel_L4_M1_42
  1030. .Lcgemm_kernel_L4_M1_100:
  1031. SAVE1x4
  1032. .Lcgemm_kernel_L4_END:
  1033. lsl temp, origK, #5
  1034. add origPB, origPB, temp // B = B + K * 4 * 8
  1035. subs counterJ, counterJ , #1 // j--
  1036. bgt .Lcgemm_kernel_L4_BEGIN
  1037. /******************************************************************************/
  // ---- 2-column pass: runs once if bit 1 of N is set ----
  1038. .Lcgemm_kernel_L2_BEGIN: // fewer than 4 columns left in the N direction
  1039. mov counterJ , origN
  1040. tst counterJ , #3
  1041. ble .Lcgemm_kernel_L999 // N is a multiple of 4: nothing left to do
  1042. tst counterJ , #2
  1043. ble .Lcgemm_kernel_L1_BEGIN // no 2-column group, try the single column
  1044. mov pCRow0, pC // pCRow0 = pC
  1045. add pC,pC,LDC, lsl #1 // pC += 2 * LDC
  1046. mov pA, origPA // pA = A
  1047. .Lcgemm_kernel_L2_M4_BEGIN:
  1048. mov counterI, origM
  1049. asr counterI, counterI, #2 // counterI = M / 4
  1050. cmp counterI,#0
  1051. ble .Lcgemm_kernel_L2_M2_BEGIN
  1052. .Lcgemm_kernel_L2_M4_20:
  1053. INIT4x2
  1054. mov pB, origPB
  1055. asr counterL , origK, #3 // counterL = K / 8
  1056. cmp counterL,#0
  1057. ble .Lcgemm_kernel_L2_M4_40
  1058. .align 5
  1059. .Lcgemm_kernel_L2_M4_22:
  1060. KERNEL4x2_SUB
  1061. KERNEL4x2_SUB
  1062. KERNEL4x2_SUB
  1063. KERNEL4x2_SUB
  1064. KERNEL4x2_SUB
  1065. KERNEL4x2_SUB
  1066. KERNEL4x2_SUB
  1067. KERNEL4x2_SUB
  1068. subs counterL, counterL, #1
  1069. bgt .Lcgemm_kernel_L2_M4_22
  1070. .Lcgemm_kernel_L2_M4_40:
  1071. ands counterL , origK, #7 // counterL = K % 8
  1072. ble .Lcgemm_kernel_L2_M4_100
  1073. .Lcgemm_kernel_L2_M4_42:
  1074. KERNEL4x2_SUB
  1075. subs counterL, counterL, #1
  1076. bgt .Lcgemm_kernel_L2_M4_42
  1077. .Lcgemm_kernel_L2_M4_100:
  1078. SAVE4x2
  1079. .Lcgemm_kernel_L2_M4_END:
  1080. subs counterI, counterI, #1
  1081. bgt .Lcgemm_kernel_L2_M4_20
  1082. .Lcgemm_kernel_L2_M2_BEGIN:
  1083. mov counterI, origM
  1084. tst counterI , #3 // any rows left below a multiple of 4?
  1085. ble .Lcgemm_kernel_L2_END
  1086. tst counterI, #2 // is bit 1 of M set (one 2-row tile)?
  1087. ble .Lcgemm_kernel_L2_M1_BEGIN
  1088. .Lcgemm_kernel_L2_M2_20:
  1089. INIT2x2
  1090. mov pB, origPB
  1091. asr counterL , origK, #3 // counterL = K / 8
  1092. cmp counterL,#0
  1093. ble .Lcgemm_kernel_L2_M2_40
  1094. .Lcgemm_kernel_L2_M2_22:
  1095. KERNEL2x2_SUB
  1096. KERNEL2x2_SUB
  1097. KERNEL2x2_SUB
  1098. KERNEL2x2_SUB
  1099. KERNEL2x2_SUB
  1100. KERNEL2x2_SUB
  1101. KERNEL2x2_SUB
  1102. KERNEL2x2_SUB
  1103. subs counterL, counterL, #1
  1104. bgt .Lcgemm_kernel_L2_M2_22
  1105. .Lcgemm_kernel_L2_M2_40:
  1106. ands counterL , origK, #7 // counterL = K % 8
  1107. ble .Lcgemm_kernel_L2_M2_100
  1108. .Lcgemm_kernel_L2_M2_42:
  1109. KERNEL2x2_SUB
  1110. subs counterL, counterL, #1
  1111. bgt .Lcgemm_kernel_L2_M2_42
  1112. .Lcgemm_kernel_L2_M2_100:
  1113. SAVE2x2
  1114. .Lcgemm_kernel_L2_M2_END:
  1115. .Lcgemm_kernel_L2_M1_BEGIN:
  1116. tst counterI, #1 // is M odd (one 1-row tile)?
  1117. ble .Lcgemm_kernel_L2_END
  1118. .Lcgemm_kernel_L2_M1_20:
  1119. INIT1x2
  1120. mov pB, origPB
  1121. asr counterL , origK, #3 // counterL = K / 8
  1122. cmp counterL, #0
  1123. ble .Lcgemm_kernel_L2_M1_40
  1124. .Lcgemm_kernel_L2_M1_22:
  1125. KERNEL1x2_SUB
  1126. KERNEL1x2_SUB
  1127. KERNEL1x2_SUB
  1128. KERNEL1x2_SUB
  1129. KERNEL1x2_SUB
  1130. KERNEL1x2_SUB
  1131. KERNEL1x2_SUB
  1132. KERNEL1x2_SUB
  1133. subs counterL, counterL, #1
  1134. bgt .Lcgemm_kernel_L2_M1_22
  1135. .Lcgemm_kernel_L2_M1_40:
  1136. ands counterL , origK, #7 // counterL = K % 8
  1137. ble .Lcgemm_kernel_L2_M1_100
  1138. .Lcgemm_kernel_L2_M1_42:
  1139. KERNEL1x2_SUB
  1140. subs counterL, counterL, #1
  1141. bgt .Lcgemm_kernel_L2_M1_42
  1142. .Lcgemm_kernel_L2_M1_100:
  1143. SAVE1x2
  1144. .Lcgemm_kernel_L2_END:
  1145. add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
  1146. /******************************************************************************/
  // ---- 1-column pass: runs once if N is odd ----
  1147. .Lcgemm_kernel_L1_BEGIN:
  1148. mov counterJ , origN
  1149. tst counterJ , #1
  1150. ble .Lcgemm_kernel_L999 // done
  1151. mov pCRow0, pC // pCRow0 = C
  1152. add pC , pC , LDC // Update pC to point to next
  1153. mov pA, origPA // pA = A
  1154. .Lcgemm_kernel_L1_M4_BEGIN:
  1155. mov counterI, origM
  1156. asr counterI, counterI, #2 // counterI = M / 4
  1157. cmp counterI, #0
  1158. ble .Lcgemm_kernel_L1_M2_BEGIN
  1159. .Lcgemm_kernel_L1_M4_20:
  1160. INIT4x1
  1161. mov pB, origPB
  1162. asr counterL , origK, #3 // counterL = K / 8
  1163. cmp counterL , #0
  1164. ble .Lcgemm_kernel_L1_M4_40
  1165. .align 5
  1166. .Lcgemm_kernel_L1_M4_22:
  1167. KERNEL4x1_SUB
  1168. KERNEL4x1_SUB
  1169. KERNEL4x1_SUB
  1170. KERNEL4x1_SUB
  1171. KERNEL4x1_SUB
  1172. KERNEL4x1_SUB
  1173. KERNEL4x1_SUB
  1174. KERNEL4x1_SUB
  1175. subs counterL, counterL, #1
  1176. bgt .Lcgemm_kernel_L1_M4_22
  1177. .Lcgemm_kernel_L1_M4_40:
  1178. ands counterL , origK, #7 // counterL = K % 8
  1179. ble .Lcgemm_kernel_L1_M4_100
  1180. .Lcgemm_kernel_L1_M4_42:
  1181. KERNEL4x1_SUB
  1182. subs counterL, counterL, #1
  1183. bgt .Lcgemm_kernel_L1_M4_42
  1184. .Lcgemm_kernel_L1_M4_100:
  1185. SAVE4x1
  1186. .Lcgemm_kernel_L1_M4_END:
  1187. subs counterI, counterI, #1
  1188. bgt .Lcgemm_kernel_L1_M4_20
  1189. .Lcgemm_kernel_L1_M2_BEGIN:
  1190. mov counterI, origM
  1191. tst counterI , #3 // any rows left below a multiple of 4?
  1192. ble .Lcgemm_kernel_L1_END
  1193. tst counterI, #2 // is bit 1 of M set (one 2-row tile)?
  1194. ble .Lcgemm_kernel_L1_M1_BEGIN
  1195. .Lcgemm_kernel_L1_M2_20:
  1196. INIT2x1
  1197. mov pB, origPB
  1198. asr counterL , origK, #3 // counterL = K / 8
  1199. cmp counterL , #0
  1200. ble .Lcgemm_kernel_L1_M2_40
  1201. .Lcgemm_kernel_L1_M2_22:
  1202. KERNEL2x1_SUB
  1203. KERNEL2x1_SUB
  1204. KERNEL2x1_SUB
  1205. KERNEL2x1_SUB
  1206. KERNEL2x1_SUB
  1207. KERNEL2x1_SUB
  1208. KERNEL2x1_SUB
  1209. KERNEL2x1_SUB
  1210. subs counterL, counterL, #1
  1211. bgt .Lcgemm_kernel_L1_M2_22
  1212. .Lcgemm_kernel_L1_M2_40:
  1213. ands counterL , origK, #7 // counterL = K % 8
  1214. ble .Lcgemm_kernel_L1_M2_100
  1215. .Lcgemm_kernel_L1_M2_42:
  1216. KERNEL2x1_SUB
  1217. subs counterL, counterL, #1
  1218. bgt .Lcgemm_kernel_L1_M2_42
  1219. .Lcgemm_kernel_L1_M2_100:
  1220. SAVE2x1
  1221. .Lcgemm_kernel_L1_M2_END:
  1222. .Lcgemm_kernel_L1_M1_BEGIN:
  1223. tst counterI, #1 // is M odd (one 1-row tile)?
  1224. ble .Lcgemm_kernel_L1_END
  1225. .Lcgemm_kernel_L1_M1_20:
  1226. INIT1x1
  1227. mov pB, origPB
  1228. asr counterL , origK, #3 // counterL = K / 8
  1229. cmp counterL , #0
  1230. ble .Lcgemm_kernel_L1_M1_40
  1231. .Lcgemm_kernel_L1_M1_22:
  1232. KERNEL1x1_SUB
  1233. KERNEL1x1_SUB
  1234. KERNEL1x1_SUB
  1235. KERNEL1x1_SUB
  1236. KERNEL1x1_SUB
  1237. KERNEL1x1_SUB
  1238. KERNEL1x1_SUB
  1239. KERNEL1x1_SUB
  1240. subs counterL, counterL, #1
  1241. bgt .Lcgemm_kernel_L1_M1_22
  1242. .Lcgemm_kernel_L1_M1_40:
  1243. ands counterL , origK, #7 // counterL = K % 8
  1244. ble .Lcgemm_kernel_L1_M1_100
  1245. .Lcgemm_kernel_L1_M1_42:
  1246. KERNEL1x1_SUB
  1247. subs counterL, counterL, #1
  1248. bgt .Lcgemm_kernel_L1_M1_42
  1249. .Lcgemm_kernel_L1_M1_100:
  1250. SAVE1x1
  1251. .Lcgemm_kernel_L1_END:
  // ---- common exit: return 0 and restore everything spilled on entry ----
  1252. .Lcgemm_kernel_L999:
  1253. mov x0, #0 // set return value
  1254. ldp d8, d9, [sp, #(0 * 16)]
  1255. ldp d10, d11, [sp, #(1 * 16)]
  1256. ldp d12, d13, [sp, #(2 * 16)]
  1257. ldp d14, d15, [sp, #(3 * 16)]
  1258. ldp d16, d17, [sp, #(4 * 16)]
  1259. ldp x18, x19, [sp, #(5 * 16)]
  1260. ldp x20, x21, [sp, #(6 * 16)]
  1261. ldp x22, x23, [sp, #(7 * 16)]
  1262. ldp x24, x25, [sp, #(8 * 16)]
  1263. ldp x26, x27, [sp, #(9 * 16)]
  1264. ldr x28, [sp, #(10 * 16)]
  1265. add sp, sp, #(11*16) // release the spill area
  1266. ret
  1267. EPILOGUE