You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

ctrmm_kernel_4x4.S 33 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629
  1. /*******************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. /* X0 X1 X2 s0 s1 X3 x4 x5 x6 x7*/
  30. /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0, FLOAT alpha1,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset */
  // General-purpose register roles. x0-x7 hold the incoming arguments in the
  // order given by the CNAME prototype comment that precedes these defines.
  31. #define origM x0
  32. #define origN x1
  33. #define origK x2
  34. #define origPA x3
  35. #define origPB x4
  36. #define pC x5
  37. #define LDC x6
  38. #define offset x7
  39. #define counterL x8
  40. #define counterI x9
  41. #define counterJ x10
  42. #define pB x11
  43. #define pCRow0 x12
  44. #define pCRow1 x13
  45. #define pCRow2 x14
  46. #define pA x15
  47. #define temp x16
  48. #define tempOffset x17
  // tempK is kept in x19, not x18. AAPCS64 makes x18 the platform register
  // (reserved on Darwin and Windows), so a portable kernel must not write it.
  // x19 is callee-saved and is already spilled by the prologue of this file.
  // NOTE(review): confirm the epilogue (outside this view) reloads x19.
  49. #define tempK x19
  // Complex alpha. The prologue copies s0 into both *_R names and s1 into
  // both *_I names; the alphaV* forms name lane 0 of the same registers for
  // use as by-element multiplier operands.
  50. #define alpha0_R s10
  51. #define alphaV0_R v10.s[0]
  52. #define alpha0_I s11
  53. #define alphaV0_I v11.s[0]
  54. #define alpha1_R s14
  55. #define alphaV1_R v14.s[0]
  56. #define alpha1_I s15
  57. #define alphaV1_I v15.s[0]
  // Accumulate opcodes for the four partial products of a complex multiply,
  // with A taken from pA and B taken from pB:
  //   OP_rr: re(A)*re(B) into the real accumulator
  //   OP_ii: im(A)*im(B) into the real accumulator
  //   OP_ri: re(A)*im(B) into the imaginary accumulator
  //   OP_ir: im(A)*re(B) into the imaginary accumulator
  // The four sign patterns below yield, in order, A*B, A*conj(B), conj(A)*B
  // and conj(A)*conj(B).
  58. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  59. #define OP_rr fmla
  60. #define OP_ii fmls
  61. #define OP_ri fmla
  62. #define OP_ir fmla
  63. #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  64. #define OP_rr fmla
  65. #define OP_ii fmla
  66. #define OP_ri fmls
  67. #define OP_ir fmla
  68. #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  69. #define OP_rr fmla
  70. #define OP_ii fmla
  71. #define OP_ri fmla
  72. #define OP_ir fmls
  73. #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
  74. #define OP_rr fmla
  75. #define OP_ii fmls
  76. #define OP_ri fmls
  77. #define OP_ir fmls
  78. #endif
  // General-purpose register map (mirrors the defines above).
  79. // 00 origM
  80. // 01 origN
  81. // 02 origK
  82. // 03 origPA
  83. // 04 origPB
  84. // 05 pC
  85. // 06 origLDC -> LDC
  86. // 07 offset
  87. // 08 counterL
  88. // 09 counterI
  89. // 10 counterJ
  90. // 11 pB
  91. // 12 pCRow0
  92. // 13 pCRow1
  93. // 14 pCRow2
  94. // 15 pA
  95. // 16 temp
  96. // 17 tempOffset
  97. // 18 platform register (AAPCS64), never written by this kernel
  98. // 19 must save tempK
  99. // 20 must save
  100. // 21 must save
  101. // 22 must save
  102. // 23 must save
  103. // 24 must save
  104. // 25 must save
  105. // 26 must save
  106. // 27 must save
  107. // 28 must save
  108. // 29 frame
  109. // 30 link
  110. // 31 sp
  111. //v00 ALPHA_R -> pA00_R, pA01_R, pA02_R, pA03_R
  112. //v01 ALPHA_I -> pA00_I, pA01_I, pA02_I, pA03_I
  113. //v02
  114. //v03
  115. //v04 pA10_R, pA11_R, pA12_R, pA13_R
  116. //v05 pA10_I, pA11_I, pA12_I, pA13_I
  117. //v06
  118. //v07
  119. //v08 must save pB00_R, pB01_R, pB02_R, pB03_R
  120. //v09 must save pB00_I, pB01_I, pB02_I, pB03_I
  121. //v10 must save ALPHA0_R
  122. //v11 must save ALPHA0_I
  123. //v12 must save pB10_R, pB11_R, pB12_R, pB13_R
  124. //v13 must save pB10_I, pB11_I, pB12_I, pB13_I
  125. //v14 must save ALPHA1_R
  126. //v15 must save ALPHA1_I
  127. //v16 must save pC00_R, pC01_R, pC02_R, pC03_R
  128. //v17 must save pC00_I, pC01_I, pC02_I, pC03_I
  129. //v18
  130. //v19
  131. //v20 pC10_R, pC11_R, pC12_R, pC13_R
  132. //v21 pC10_I, pC11_I, pC12_I, pC13_I
  133. //v22
  134. //v23
  135. //v24 pC20_R, pC21_R, pC22_R, pC23_R
  136. //v25 pC20_I, pC21_I, pC22_I, pC23_I
  137. //v26
  138. //v27
  139. //v28 pC30_R, pC31_R, pC32_R, pC33_R
  140. //v29 pC30_I, pC31_I, pC32_I, pC33_I
  141. //v30
  142. //v31
  143. /*******************************************************************************
  144. * Macro definitions
  145. *******************************************************************************/
  // INIT4x4: zero every lane of the eight 4x4-tile accumulators, i.e. the
  // real/imaginary pairs v16/v17, v20/v21, v24/v25 and v28/v29.
  146. .macro INIT4x4
  147. movi v16.4s, #0
  148. movi v17.4s, #0
  149. movi v20.4s, #0
  150. movi v21.4s, #0
  151. movi v24.4s, #0
  152. movi v25.4s, #0
  153. movi v28.4s, #0
  154. movi v29.4s, #0
  155. .endm
  // KERNEL4x4_I: first k-step of the software-pipelined 4x4 loop.
  // Loads one k-slice of B into v8/v9 and one of A into v0/v1 (ld2 splits the
  // interleaved data into a real and an imaginary register), then starts each
  // accumulator with a multiply instead of an accumulate, so INIT4x4 is not
  // needed before it. Finally preloads the next k-slice into v12/v13 (B) and
  // v4/v5 (A) for a following KERNEL4x4_M2 or KERNEL4x4_E.
  // pA and pB each advance by 64 bytes in total.
  156. .macro KERNEL4x4_I
  157. ld2 {v8.4s, v9.4s}, [pB]
  158. add pB, pB, #32
  159. ld2 {v0.4s, v1.4s}, [pA]
  160. add pA, pA, #32
  // Accumulator pair v16/v17: A slice times B element 0.
  161. fmul v16.4s, v0.4s, v8.s[0]
  162. OP_ii v16.4s, v1.4s, v9.s[0]
  // In the variants listed here OP_ri is fmls, so the re*im product has to
  // enter with a negative sign: the accumulator is cleared and fmls is used.
  // The other variants can start with a plain fmul. The same pattern repeats
  // for v21, v25 and v29 below.
  163. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  164. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  165. eor v17.16b, v17.16b, v17.16b
  166. fmls v17.4s, v0.4s, v9.s[0]
  167. #else
  168. fmul v17.4s, v0.4s, v9.s[0]
  169. #endif
  170. OP_ir v17.4s, v1.4s, v8.s[0]
  // Accumulator pair v20/v21: A slice times B element 1.
  171. fmul v20.4s, v0.4s, v8.s[1]
  172. OP_ii v20.4s, v1.4s, v9.s[1]
  173. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  174. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  175. eor v21.16b, v21.16b, v21.16b
  176. fmls v21.4s, v0.4s, v9.s[1]
  177. #else
  178. fmul v21.4s, v0.4s, v9.s[1]
  179. #endif
  180. OP_ir v21.4s, v1.4s, v8.s[1]
  // Accumulator pair v24/v25: A slice times B element 2.
  181. fmul v24.4s, v0.4s, v8.s[2]
  182. OP_ii v24.4s, v1.4s, v9.s[2]
  183. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  184. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  185. eor v25.16b, v25.16b, v25.16b
  186. fmls v25.4s, v0.4s, v9.s[2]
  187. #else
  188. fmul v25.4s, v0.4s, v9.s[2]
  189. #endif
  190. OP_ir v25.4s, v1.4s, v8.s[2]
  // Accumulator pair v28/v29: A slice times B element 3.
  191. fmul v28.4s, v0.4s, v8.s[3]
  192. OP_ii v28.4s, v1.4s, v9.s[3]
  193. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  194. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  195. eor v29.16b, v29.16b, v29.16b
  196. fmls v29.4s, v0.4s, v9.s[3]
  197. #else
  198. fmul v29.4s, v0.4s, v9.s[3]
  199. #endif
  200. OP_ir v29.4s, v1.4s, v8.s[3]
  // Preload the second register set for the next k-step.
  201. ld2 {v12.4s, v13.4s}, [pB]
  202. add pB, pB, #32
  203. ld2 {v4.4s, v5.4s}, [pA]
  204. add pA, pA, #32
  205. .endm
  // KERNEL4x4_M1: pipelined k-step that consumes register set 0
  // (A in v0/v1, B in v8/v9) while loading set 1 (A into v4/v5, B into
  // v12/v13) for the following KERNEL4x4_M2 or KERNEL4x4_E. The loads are
  // interleaved between the FMA groups. pA and pB advance by 32 bytes each.
  206. .macro KERNEL4x4_M1
  207. OP_rr v16.4s, v0.4s, v8.s[0]
  208. OP_ii v16.4s, v1.4s, v9.s[0]
  209. OP_ri v17.4s, v0.4s, v9.s[0]
  210. OP_ir v17.4s, v1.4s, v8.s[0]
  211. ld2 {v12.4s, v13.4s}, [pB] // For next round
  212. add pB, pB, #32
  213. OP_rr v20.4s, v0.4s, v8.s[1]
  214. OP_ii v20.4s, v1.4s, v9.s[1]
  215. OP_ri v21.4s, v0.4s, v9.s[1]
  216. OP_ir v21.4s, v1.4s, v8.s[1]
  217. ld2 {v4.4s, v5.4s}, [pA] // For next round
  218. add pA, pA, #32
  219. OP_rr v24.4s, v0.4s, v8.s[2]
  220. OP_ii v24.4s, v1.4s, v9.s[2]
  221. OP_ri v25.4s, v0.4s, v9.s[2]
  222. OP_ir v25.4s, v1.4s, v8.s[2]
  // Prefetch A 512 bytes ahead of the current read pointer.
  223. prfm PLDL1KEEP, [pA, #512]
  224. OP_rr v28.4s, v0.4s, v8.s[3]
  225. OP_ii v28.4s, v1.4s, v9.s[3]
  226. OP_ri v29.4s, v0.4s, v9.s[3]
  227. OP_ir v29.4s, v1.4s, v8.s[3]
  228. .endm
  // KERNEL4x4_M2: pipelined k-step that consumes register set 1
  // (A in v4/v5, B in v12/v13) while loading set 0 (A into v0/v1, B into
  // v8/v9) for the following KERNEL4x4_M1. pA and pB advance by 32 bytes each.
  229. .macro KERNEL4x4_M2
  230. OP_rr v16.4s, v4.4s, v12.s[0]
  231. OP_ii v16.4s, v5.4s, v13.s[0]
  232. OP_ri v17.4s, v4.4s, v13.s[0]
  233. OP_ir v17.4s, v5.4s, v12.s[0]
  234. ld2 {v8.4s, v9.4s}, [pB] // For next round
  235. add pB, pB, #32
  236. OP_rr v20.4s, v4.4s, v12.s[1]
  237. OP_ii v20.4s, v5.4s, v13.s[1]
  238. OP_ri v21.4s, v4.4s, v13.s[1]
  239. OP_ir v21.4s, v5.4s, v12.s[1]
  240. ld2 {v0.4s, v1.4s}, [pA] // For next round
  241. add pA, pA, #32
  242. OP_rr v24.4s, v4.4s, v12.s[2]
  243. OP_ii v24.4s, v5.4s, v13.s[2]
  244. OP_ri v25.4s, v4.4s, v13.s[2]
  245. OP_ir v25.4s, v5.4s, v12.s[2]
  // Prefetch B 512 bytes ahead of the current read pointer.
  246. prfm PLDL1KEEP, [pB, #512]
  247. OP_rr v28.4s, v4.4s, v12.s[3]
  248. OP_ii v28.4s, v5.4s, v13.s[3]
  249. OP_ri v29.4s, v4.4s, v13.s[3]
  250. OP_ir v29.4s, v5.4s, v12.s[3]
  251. .endm
  // KERNEL4x4_E: final pipelined k-step. Consumes register set 1
  // (A in v4/v5, B in v12/v13) and issues no loads, so pA and pB are left
  // unchanged and nothing is read past the data already fetched.
  252. .macro KERNEL4x4_E
  253. OP_rr v16.4s, v4.4s, v12.s[0]
  254. OP_ii v16.4s, v5.4s, v13.s[0]
  255. OP_ri v17.4s, v4.4s, v13.s[0]
  256. OP_ir v17.4s, v5.4s, v12.s[0]
  257. OP_rr v20.4s, v4.4s, v12.s[1]
  258. OP_ii v20.4s, v5.4s, v13.s[1]
  259. OP_ri v21.4s, v4.4s, v13.s[1]
  260. OP_ir v21.4s, v5.4s, v12.s[1]
  261. OP_rr v24.4s, v4.4s, v12.s[2]
  262. OP_ii v24.4s, v5.4s, v13.s[2]
  263. OP_ri v25.4s, v4.4s, v13.s[2]
  264. OP_ir v25.4s, v5.4s, v12.s[2]
  265. OP_rr v28.4s, v4.4s, v12.s[3]
  266. OP_ii v28.4s, v5.4s, v13.s[3]
  267. OP_ri v29.4s, v4.4s, v13.s[3]
  268. OP_ir v29.4s, v5.4s, v12.s[3]
  269. .endm
  // KERNEL4x4_SUB: self-contained k-step (load, then accumulate) for the 4x4
  // tile. It only accumulates, so v16..v29 must already hold valid partial
  // sums or zeros. pA and pB advance by 32 bytes each.
  270. .macro KERNEL4x4_SUB
  271. ld2 {v8.4s, v9.4s}, [pB]
  272. add pB, pB, #32
  273. ld2 {v0.4s, v1.4s}, [pA]
  274. add pA, pA, #32
  275. OP_rr v16.4s, v0.4s, v8.s[0]
  276. OP_ii v16.4s, v1.4s, v9.s[0]
  277. OP_ri v17.4s, v0.4s, v9.s[0]
  278. OP_ir v17.4s, v1.4s, v8.s[0]
  279. OP_rr v20.4s, v0.4s, v8.s[1]
  280. OP_ii v20.4s, v1.4s, v9.s[1]
  281. OP_ri v21.4s, v0.4s, v9.s[1]
  282. OP_ir v21.4s, v1.4s, v8.s[1]
  283. OP_rr v24.4s, v0.4s, v8.s[2]
  284. OP_ii v24.4s, v1.4s, v9.s[2]
  285. OP_ri v25.4s, v0.4s, v9.s[2]
  286. OP_ir v25.4s, v1.4s, v8.s[2]
  287. OP_rr v28.4s, v0.4s, v8.s[3]
  288. OP_ii v28.4s, v1.4s, v9.s[3]
  289. OP_ri v29.4s, v0.4s, v9.s[3]
  290. OP_ir v29.4s, v1.4s, v8.s[3]
  291. .endm
  // SAVE4x4: scale the 4x4 tile by complex alpha and store it to C.
  // For each accumulator pair (acc_r, acc_i):
  //   real = acc_r * alpha_r - acc_i * alpha_i
  //   imag = acc_r * alpha_i + acc_i * alpha_r
  // st2 re-interleaves real and imaginary parts. The four groups of four
  // complex values go to pCRow0, pCRow0 + LDC, + 2*LDC and + 3*LDC (LDC is a
  // byte stride here). C is overwritten; its previous contents are not read.
  // Clobbers v0, v1, v4, v5 and pCRow1; pCRow0 advances by 32 bytes.
  292. .macro SAVE4x4
  293. mov pCRow1, pCRow0
  294. fmul v0.4s, v16.4s, alphaV0_R
  295. fmls v0.4s, v17.4s, alphaV0_I
  296. fmul v1.4s, v16.4s, alphaV1_I
  297. fmla v1.4s, v17.4s, alphaV1_R
  298. st2 {v0.4s, v1.4s}, [pCRow1]
  299. add pCRow1, pCRow1, LDC
  300. fmul v4.4s, v20.4s, alphaV0_R
  301. fmls v4.4s, v21.4s, alphaV0_I
  302. fmul v5.4s, v20.4s, alphaV1_I
  303. fmla v5.4s, v21.4s, alphaV1_R
  304. st2 {v4.4s, v5.4s}, [pCRow1]
  305. add pCRow1, pCRow1, LDC
  306. fmul v0.4s, v24.4s, alphaV0_R
  307. fmls v0.4s, v25.4s, alphaV0_I
  308. fmul v1.4s, v24.4s, alphaV1_I
  309. fmla v1.4s, v25.4s, alphaV1_R
  310. st2 {v0.4s, v1.4s}, [pCRow1]
  311. add pCRow1, pCRow1, LDC
  312. fmul v4.4s, v28.4s, alphaV0_R
  313. fmls v4.4s, v29.4s, alphaV0_I
  314. fmul v5.4s, v28.4s, alphaV1_I
  315. fmla v5.4s, v29.4s, alphaV1_R
  316. st2 {v4.4s, v5.4s}, [pCRow1]
  317. add pCRow0, pCRow0, #32
  318. .endm
  319. /******************************************************************************/
  // INIT2x4: zero every lane of the eight accumulators used by the 2x4 tile
  // (v16/v17, v20/v21, v24/v25, v28/v29).
  320. .macro INIT2x4
  321. movi v16.4s, #0
  322. movi v17.4s, #0
  323. movi v20.4s, #0
  324. movi v21.4s, #0
  325. movi v24.4s, #0
  326. movi v25.4s, #0
  327. movi v28.4s, #0
  328. movi v29.4s, #0
  329. .endm
  // KERNEL2x4_SUB: one k-step of the 2x4 tile. Loads four complex B values
  // (32 bytes) into v8/v9 and two complex A values (16 bytes) into the low
  // halves of v0/v1, then accumulates into the .2s lanes of v16..v29.
  // Accumulate-only: run INIT2x4 first.
  330. .macro KERNEL2x4_SUB
  331. ld2 {v8.4s, v9.4s}, [pB]
  332. add pB, pB, #32
  333. ld2 {v0.2s, v1.2s}, [pA]
  334. add pA, pA, #16
  335. OP_rr v16.2s, v0.2s, v8.s[0]
  336. OP_ii v16.2s, v1.2s, v9.s[0]
  337. OP_ri v17.2s, v0.2s, v9.s[0]
  338. OP_ir v17.2s, v1.2s, v8.s[0]
  339. OP_rr v20.2s, v0.2s, v8.s[1]
  340. OP_ii v20.2s, v1.2s, v9.s[1]
  341. OP_ri v21.2s, v0.2s, v9.s[1]
  342. OP_ir v21.2s, v1.2s, v8.s[1]
  343. OP_rr v24.2s, v0.2s, v8.s[2]
  344. OP_ii v24.2s, v1.2s, v9.s[2]
  345. OP_ri v25.2s, v0.2s, v9.s[2]
  346. OP_ir v25.2s, v1.2s, v8.s[2]
  347. OP_rr v28.2s, v0.2s, v8.s[3]
  348. OP_ii v28.2s, v1.2s, v9.s[3]
  349. OP_ri v29.2s, v0.2s, v9.s[3]
  350. OP_ir v29.2s, v1.2s, v8.s[3]
  351. .endm
  // SAVE2x4: scale the 2x4 tile by complex alpha
  // (real = acc_r*alpha_r - acc_i*alpha_i, imag = acc_r*alpha_i + acc_i*alpha_r)
  // and store two complex values at each of pCRow0, +LDC, +2*LDC, +3*LDC.
  // C is overwritten without being read. Clobbers v0, v1, v4, v5 and pCRow1;
  // pCRow0 advances by 16 bytes.
  352. .macro SAVE2x4
  353. mov pCRow1, pCRow0
  354. fmul v0.2s, v16.2s, alphaV0_R
  355. fmls v0.2s, v17.2s, alphaV0_I
  356. fmul v1.2s, v16.2s, alphaV1_I
  357. fmla v1.2s, v17.2s, alphaV1_R
  358. st2 {v0.2s, v1.2s}, [pCRow1]
  359. add pCRow1, pCRow1, LDC
  360. fmul v4.2s, v20.2s, alphaV0_R
  361. fmls v4.2s, v21.2s, alphaV0_I
  362. fmul v5.2s, v20.2s, alphaV1_I
  363. fmla v5.2s, v21.2s, alphaV1_R
  364. st2 {v4.2s, v5.2s}, [pCRow1]
  365. add pCRow1, pCRow1, LDC
  366. fmul v0.2s, v24.2s, alphaV0_R
  367. fmls v0.2s, v25.2s, alphaV0_I
  368. fmul v1.2s, v24.2s, alphaV1_I
  369. fmla v1.2s, v25.2s, alphaV1_R
  370. st2 {v0.2s, v1.2s}, [pCRow1]
  371. add pCRow1, pCRow1, LDC
  372. fmul v4.2s, v28.2s, alphaV0_R
  373. fmls v4.2s, v29.2s, alphaV0_I
  374. fmul v5.2s, v28.2s, alphaV1_I
  375. fmla v5.2s, v29.2s, alphaV1_R
  376. st2 {v4.2s, v5.2s}, [pCRow1]
  377. add pCRow0, pCRow0, #16
  378. .endm
  379. /******************************************************************************/
  // INIT1x4: zero every lane of the eight accumulators used by the 1x4 tile
  // (v16/v17, v20/v21, v24/v25, v28/v29).
  380. .macro INIT1x4
  381. movi v16.4s, #0
  382. movi v17.4s, #0
  383. movi v20.4s, #0
  384. movi v21.4s, #0
  385. movi v24.4s, #0
  386. movi v25.4s, #0
  387. movi v28.4s, #0
  388. movi v29.4s, #0
  389. .endm
  // KERNEL1x4_SUB: one k-step of the 1x4 tile. Loads four complex B values
  // (32 bytes) into v8/v9 and a single complex A value (8 bytes) into lane 0
  // of v0/v1, then accumulates with scalar by-element FMAs into s16..s29.
  // Accumulate-only: run INIT1x4 first.
  390. .macro KERNEL1x4_SUB
  391. ld2 {v8.4s, v9.4s}, [pB]
  392. add pB, pB, #32
  393. ld2 {v0.s, v1.s}[0], [pA]
  394. add pA, pA, #8
  395. OP_rr s16, s0, v8.s[0]
  396. OP_ii s16, s1, v9.s[0]
  397. OP_ri s17, s0, v9.s[0]
  398. OP_ir s17, s1, v8.s[0]
  399. OP_rr s20, s0, v8.s[1]
  400. OP_ii s20, s1, v9.s[1]
  401. OP_ri s21, s0, v9.s[1]
  402. OP_ir s21, s1, v8.s[1]
  403. OP_rr s24, s0, v8.s[2]
  404. OP_ii s24, s1, v9.s[2]
  405. OP_ri s25, s0, v9.s[2]
  406. OP_ir s25, s1, v8.s[2]
  407. OP_rr s28, s0, v8.s[3]
  408. OP_ii s28, s1, v9.s[3]
  409. OP_ri s29, s0, v9.s[3]
  410. OP_ir s29, s1, v8.s[3]
  411. .endm
  // SAVE1x4: scale the 1x4 tile by complex alpha
  // (real = acc_r*alpha_r - acc_i*alpha_i, imag = acc_r*alpha_i + acc_i*alpha_r)
  // and store one complex value at each of pCRow0, +LDC, +2*LDC, +3*LDC using
  // single-lane st2. C is overwritten without being read. Clobbers v0, v1,
  // v4, v5 and pCRow1; pCRow0 advances by 8 bytes.
  412. .macro SAVE1x4
  413. mov pCRow1, pCRow0
  414. fmul s0, s16, alphaV0_R
  415. fmls s0, s17, alphaV0_I
  416. fmul s1, s16, alphaV1_I
  417. fmla s1, s17, alphaV1_R
  418. st2 {v0.s, v1.s}[0], [pCRow1]
  419. add pCRow1, pCRow1, LDC
  420. fmul s4, s20, alphaV0_R
  421. fmls s4, s21, alphaV0_I
  422. fmul s5, s20, alphaV1_I
  423. fmla s5, s21, alphaV1_R
  424. st2 {v4.s, v5.s}[0], [pCRow1]
  425. add pCRow1, pCRow1, LDC
  426. fmul s0, s24, alphaV0_R
  427. fmls s0, s25, alphaV0_I
  428. fmul s1, s24, alphaV1_I
  429. fmla s1, s25, alphaV1_R
  430. st2 {v0.s, v1.s}[0], [pCRow1]
  431. add pCRow1, pCRow1, LDC
  432. fmul s4, s28, alphaV0_R
  433. fmls s4, s29, alphaV0_I
  434. fmul s5, s28, alphaV1_I
  435. fmla s5, s29, alphaV1_R
  436. st2 {v4.s, v5.s}[0], [pCRow1]
  437. add pCRow0, pCRow0, #8
  438. .endm
  439. /******************************************************************************/
  // INIT4x2: zero every lane of the four accumulators used by the 4x2 tile
  // (v16/v17 and v20/v21).
  440. .macro INIT4x2
  441. movi v16.4s, #0
  442. movi v17.4s, #0
  443. movi v20.4s, #0
  444. movi v21.4s, #0
  445. .endm
  // KERNEL4x2_SUB: one k-step of the 4x2 tile. Loads two complex B values
  // (16 bytes) into the low halves of v8/v9 and four complex A values
  // (32 bytes) into v0/v1, then accumulates into v16/v17 and v20/v21.
  // Accumulate-only: run INIT4x2 first.
  446. .macro KERNEL4x2_SUB
  447. ld2 {v8.2s, v9.2s}, [pB]
  448. add pB, pB, #16
  449. ld2 {v0.4s, v1.4s}, [pA]
  450. add pA, pA, #32
  451. OP_rr v16.4s, v0.4s, v8.s[0]
  452. OP_ii v16.4s, v1.4s, v9.s[0]
  453. OP_ri v17.4s, v0.4s, v9.s[0]
  454. OP_ir v17.4s, v1.4s, v8.s[0]
  455. OP_rr v20.4s, v0.4s, v8.s[1]
  456. OP_ii v20.4s, v1.4s, v9.s[1]
  457. OP_ri v21.4s, v0.4s, v9.s[1]
  458. OP_ir v21.4s, v1.4s, v8.s[1]
  459. .endm
  // SAVE4x2: scale the 4x2 tile by complex alpha
  // (real = acc_r*alpha_r - acc_i*alpha_i, imag = acc_r*alpha_i + acc_i*alpha_r)
  // and store four complex values at pCRow0 and at pCRow0 + LDC.
  // C is overwritten without being read. Clobbers v0, v1, v4, v5 and pCRow1;
  // pCRow0 advances by 32 bytes.
  460. .macro SAVE4x2
  461. mov pCRow1, pCRow0
  462. fmul v0.4s, v16.4s, alphaV0_R
  463. fmls v0.4s, v17.4s, alphaV0_I
  464. fmul v1.4s, v16.4s, alphaV1_I
  465. fmla v1.4s, v17.4s, alphaV1_R
  466. st2 {v0.4s, v1.4s}, [pCRow1]
  467. add pCRow1, pCRow1, LDC
  468. fmul v4.4s, v20.4s, alphaV0_R
  469. fmls v4.4s, v21.4s, alphaV0_I
  470. fmul v5.4s, v20.4s, alphaV1_I
  471. fmla v5.4s, v21.4s, alphaV1_R
  472. st2 {v4.4s, v5.4s}, [pCRow1]
  473. add pCRow0, pCRow0, #32
  474. .endm
  475. /******************************************************************************/
  // INIT2x2: zero every lane of the four accumulators used by the 2x2 tile
  // (v16/v17 and v20/v21).
  476. .macro INIT2x2
  477. movi v16.4s, #0
  478. movi v17.4s, #0
  479. movi v20.4s, #0
  480. movi v21.4s, #0
  481. .endm
  // KERNEL2x2_SUB: one k-step of the 2x2 tile. Loads two complex B values and
  // two complex A values (16 bytes each) into the low halves of v8/v9 and
  // v0/v1, then accumulates into the .2s lanes of v16/v17 and v20/v21.
  // Accumulate-only: run INIT2x2 first.
  482. .macro KERNEL2x2_SUB
  483. ld2 {v8.2s, v9.2s}, [pB]
  484. add pB, pB, #16
  485. ld2 {v0.2s, v1.2s}, [pA]
  486. add pA, pA, #16
  487. OP_rr v16.2s, v0.2s, v8.s[0]
  488. OP_ii v16.2s, v1.2s, v9.s[0]
  489. OP_ri v17.2s, v0.2s, v9.s[0]
  490. OP_ir v17.2s, v1.2s, v8.s[0]
  491. OP_rr v20.2s, v0.2s, v8.s[1]
  492. OP_ii v20.2s, v1.2s, v9.s[1]
  493. OP_ri v21.2s, v0.2s, v9.s[1]
  494. OP_ir v21.2s, v1.2s, v8.s[1]
  495. .endm
  // SAVE2x2: scale the 2x2 tile by complex alpha
  // (real = acc_r*alpha_r - acc_i*alpha_i, imag = acc_r*alpha_i + acc_i*alpha_r)
  // and store two complex values at pCRow0 and at pCRow0 + LDC.
  // C is overwritten without being read. Clobbers v0, v1, v4, v5 and pCRow1;
  // pCRow0 advances by 16 bytes.
  496. .macro SAVE2x2
  497. mov pCRow1, pCRow0
  498. fmul v0.2s, v16.2s, alphaV0_R
  499. fmls v0.2s, v17.2s, alphaV0_I
  500. fmul v1.2s, v16.2s, alphaV1_I
  501. fmla v1.2s, v17.2s, alphaV1_R
  502. st2 {v0.2s, v1.2s}, [pCRow1]
  503. add pCRow1, pCRow1, LDC
  504. fmul v4.2s, v20.2s, alphaV0_R
  505. fmls v4.2s, v21.2s, alphaV0_I
  506. fmul v5.2s, v20.2s, alphaV1_I
  507. fmla v5.2s, v21.2s, alphaV1_R
  508. st2 {v4.2s, v5.2s}, [pCRow1]
  509. add pCRow0, pCRow0, #16
  510. .endm
  511. /******************************************************************************/
  // INIT1x2: zero every lane of the four accumulators used by the 1x2 tile
  // (v16/v17 and v20/v21).
  512. .macro INIT1x2
  513. movi v16.4s, #0
  514. movi v17.4s, #0
  515. movi v20.4s, #0
  516. movi v21.4s, #0
  517. .endm
  // KERNEL1x2_SUB: one k-step of the 1x2 tile. Loads two complex B values
  // (16 bytes) into the low halves of v8/v9 and a single complex A value
  // (8 bytes) into lane 0 of v0/v1, then accumulates with scalar by-element
  // FMAs into s16/s17 and s20/s21. Accumulate-only: run INIT1x2 first.
  518. .macro KERNEL1x2_SUB
  519. ld2 {v8.2s, v9.2s}, [pB]
  520. add pB, pB, #16
  521. ld2 {v0.s, v1.s}[0], [pA]
  522. add pA, pA, #8
  523. OP_rr s16, s0, v8.s[0]
  524. OP_ii s16, s1, v9.s[0]
  525. OP_ri s17, s0, v9.s[0]
  526. OP_ir s17, s1, v8.s[0]
  527. OP_rr s20, s0, v8.s[1]
  528. OP_ii s20, s1, v9.s[1]
  529. OP_ri s21, s0, v9.s[1]
  530. OP_ir s21, s1, v8.s[1]
  531. .endm
  // SAVE1x2: scale the 1x2 tile by complex alpha
  // (real = acc_r*alpha_r - acc_i*alpha_i, imag = acc_r*alpha_i + acc_i*alpha_r)
  // and store one complex value at pCRow0 and one at pCRow0 + LDC using
  // single-lane st2. C is overwritten without being read. Clobbers v0, v1,
  // v4, v5 and pCRow1; pCRow0 advances by 8 bytes.
  532. .macro SAVE1x2
  533. mov pCRow1, pCRow0
  534. fmul s0, s16, alphaV0_R
  535. fmls s0, s17, alphaV0_I
  536. fmul s1, s16, alphaV1_I
  537. fmla s1, s17, alphaV1_R
  538. st2 {v0.s, v1.s}[0], [pCRow1]
  539. add pCRow1, pCRow1, LDC
  540. fmul s4, s20, alphaV0_R
  541. fmls s4, s21, alphaV0_I
  542. fmul s5, s20, alphaV1_I
  543. fmla s5, s21, alphaV1_R
  544. st2 {v4.s, v5.s}[0], [pCRow1]
  545. add pCRow0, pCRow0, #8
  546. .endm
  547. /******************************************************************************/
  // INIT4x1: zero every lane of the accumulator pair v16/v17 (4x1 tile).
  548. .macro INIT4x1
  549. movi v16.4s, #0
  550. movi v17.4s, #0
  551. .endm
  // KERNEL4x1_SUB: one k-step of the 4x1 tile. Loads a single complex B value
  // (8 bytes) into lane 0 of v8/v9 and four complex A values (32 bytes) into
  // v0/v1, then accumulates into v16/v17. Accumulate-only: run INIT4x1 first.
  552. .macro KERNEL4x1_SUB
  553. ld2 {v8.s, v9.s}[0], [pB]
  554. add pB, pB, #8
  555. ld2 {v0.4s, v1.4s}, [pA]
  556. add pA, pA, #32
  557. OP_rr v16.4s, v0.4s, v8.s[0]
  558. OP_ii v16.4s, v1.4s, v9.s[0]
  559. OP_ri v17.4s, v0.4s, v9.s[0]
  560. OP_ir v17.4s, v1.4s, v8.s[0]
  561. .endm
  // SAVE4x1: scale the 4x1 tile by complex alpha
  // (real = acc_r*alpha_r - acc_i*alpha_i, imag = acc_r*alpha_i + acc_i*alpha_r)
  // and store four complex values at pCRow0. C is overwritten without being
  // read. Clobbers v0, v1 and pCRow1; pCRow0 advances by 32 bytes.
  562. .macro SAVE4x1
  563. mov pCRow1, pCRow0
  564. fmul v0.4s, v16.4s, alphaV0_R
  565. fmls v0.4s, v17.4s, alphaV0_I
  566. fmul v1.4s, v16.4s, alphaV1_I
  567. fmla v1.4s, v17.4s, alphaV1_R
  568. st2 {v0.4s, v1.4s}, [pCRow1]
  569. add pCRow0, pCRow0, #32
  570. .endm
  571. /******************************************************************************/
  // INIT2x1: zero every lane of the accumulator pair v16/v17 (2x1 tile).
  572. .macro INIT2x1
  573. movi v16.4s, #0
  574. movi v17.4s, #0
  575. .endm
  // KERNEL2x1_SUB: one k-step of the 2x1 tile. Loads a single complex B value
  // (8 bytes) into lane 0 of v8/v9 and two complex A values (16 bytes) into
  // the low halves of v0/v1, then accumulates into the .2s lanes of v16/v17.
  // Accumulate-only: run INIT2x1 first.
  576. .macro KERNEL2x1_SUB
  577. ld2 {v8.s, v9.s}[0], [pB]
  578. add pB, pB, #8
  579. ld2 {v0.2s, v1.2s}, [pA]
  580. add pA, pA, #16
  581. OP_rr v16.2s, v0.2s, v8.s[0]
  582. OP_ii v16.2s, v1.2s, v9.s[0]
  583. OP_ri v17.2s, v0.2s, v9.s[0]
  584. OP_ir v17.2s, v1.2s, v8.s[0]
  585. .endm
  // SAVE2x1: scale the 2x1 tile by complex alpha
  // (real = acc_r*alpha_r - acc_i*alpha_i, imag = acc_r*alpha_i + acc_i*alpha_r)
  // and store two complex values at pCRow0. C is overwritten without being
  // read. Clobbers v0, v1 and pCRow1; pCRow0 advances by 16 bytes.
  586. .macro SAVE2x1
  587. mov pCRow1, pCRow0
  588. fmul v0.2s, v16.2s, alphaV0_R
  589. fmls v0.2s, v17.2s, alphaV0_I
  590. fmul v1.2s, v16.2s, alphaV1_I
  591. fmla v1.2s, v17.2s, alphaV1_R
  592. st2 {v0.2s, v1.2s}, [pCRow1]
  593. add pCRow0, pCRow0, #16
  594. .endm
  595. /******************************************************************************/
  // INIT1x1: zero every lane of the accumulator pair v16/v17 (1x1 tile).
  596. .macro INIT1x1
  597. movi v16.4s, #0
  598. movi v17.4s, #0
  599. .endm
  // KERNEL1x1_SUB: one k-step of the 1x1 tile. Loads one complex B value into
  // lane 0 of v8/v9 and one complex A value into lane 0 of v0/v1 (8 bytes
  // each), then accumulates with scalar by-element FMAs into s16/s17.
  // Accumulate-only: run INIT1x1 first.
  600. .macro KERNEL1x1_SUB
  601. ld2 {v8.s, v9.s}[0], [pB]
  602. add pB, pB, #8
  603. ld2 {v0.s, v1.s}[0], [pA]
  604. add pA, pA, #8
  605. OP_rr s16, s0, v8.s[0]
  606. OP_ii s16, s1, v9.s[0]
  607. OP_ri s17, s0, v9.s[0]
  608. OP_ir s17, s1, v8.s[0]
  609. .endm
  // SAVE1x1: scale the single accumulated value by complex alpha
  // (real = acc_r*alpha_r - acc_i*alpha_i, imag = acc_r*alpha_i + acc_i*alpha_r)
  // and store it at pCRow0 with a single-lane st2. C is overwritten without
  // being read. Clobbers v0, v1 and pCRow1; pCRow0 advances by 8 bytes.
  610. .macro SAVE1x1
  611. mov pCRow1, pCRow0
  612. fmul s0, s16, alphaV0_R
  613. fmls s0, s17, alphaV0_I
  614. fmul s1, s16, alphaV1_I
  615. fmla s1, s17, alphaV1_R
  616. st2 {v0.s, v1.s}[0], [pCRow1]
  617. add pCRow0, pCRow0, #8
  618. .endm
  619. /*******************************************************************************
  620. * End of macro definitions
  621. *******************************************************************************/
  // Function entry. PROLOGUE is a macro defined outside this file's visible
  // part (presumably via common.h; it is expected to emit the function label).
  622. PROLOGUE
  623. .align 5
  // Reserve 11 * 16 bytes of stack and spill d8-d17 and x18-x28 into it.
  624. add sp, sp, #-(11 * 16)
  625. stp d8, d9, [sp, #(0 * 16)]
  626. stp d10, d11, [sp, #(1 * 16)]
  627. stp d12, d13, [sp, #(2 * 16)]
  628. stp d14, d15, [sp, #(3 * 16)]
  629. stp d16, d17, [sp, #(4 * 16)]
  630. stp x18, x19, [sp, #(5 * 16)]
  631. stp x20, x21, [sp, #(6 * 16)]
  632. stp x22, x23, [sp, #(7 * 16)]
  633. stp x24, x25, [sp, #(8 * 16)]
  634. stp x26, x27, [sp, #(9 * 16)]
  635. str x28, [sp, #(10 * 16)]
  // Park complex alpha away from the scratch registers: s0 goes to both
  // *_R aliases and s1 to both *_I aliases, because v0/v1 are reused as
  // A operands and as SAVE temporaries.
  636. fmov alpha0_R, s0
  637. fmov alpha0_I, s1
  638. fmov alpha1_R, s0
  639. fmov alpha1_I, s1
  640. lsl LDC, LDC, #3 // ldc = ldc * 8, i.e. LDC becomes a byte stride
  // Without LEFT the running offset starts at -offset; with LEFT it is
  // reloaded from offset at the top of every N panel instead.
  641. #if !defined(LEFT)
  642. neg tempOffset, offset
  643. #endif
  644. mov pB, origPB
  645. mov counterJ, origN
  646. asr counterJ, counterJ, #2 // J = N / 4, number of full 4-wide panels
  647. cmp counterJ, #0
  648. ble .Lctrmm_kernel_L2_BEGIN // no full panel: go straight to the N remainder
  649. /******************************************************************************/
  // Top of the loop over 4-wide panels in N (counterJ iterations).
  650. .Lctrmm_kernel_L4_BEGIN:
  651. mov pCRow0, pC // pCRow0 = C
  652. add pC, pC, LDC, lsl #2 // pC += 4 * LDC: start of the next panel
  653. #if defined(LEFT)
  654. mov tempOffset, offset // restart the running offset for this panel
  655. #endif
  656. mov pA, origPA // pA = start of A array
  // 4x4 tiles of the current panel: counterI = M / 4 iterations.
  657. .Lctrmm_kernel_L4_M4_BEGIN:
  658. mov counterI, origM
  659. asr counterI, counterI, #2 // counterI = M / 4
  660. cmp counterI, #0
  661. ble .Lctrmm_kernel_L4_M2_BEGIN
  662. .Lctrmm_kernel_L4_M4_20:
  // Select the starting k position. In the first case B is read from its
  // start; otherwise both pA and pB skip tempOffset k-steps of 32 bytes.
  663. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  664. mov pB, origPB
  665. #else
  666. mov pB, origPB
  667. lsl temp, tempOffset, #5
  668. add pB, pB, temp
  669. add pA, pA, temp
  670. #endif
  // tempK = number of k-steps this tile really has to process.
  671. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  672. sub tempK, origK, tempOffset
  673. #elif defined(LEFT)
  674. add tempK, tempOffset, #4
  675. #else
  676. add tempK, tempOffset, #4
  677. #endif
  678. asr counterL , tempK, #1 // L = tempK / 2 (k-steps are processed in pairs)
  679. cmp counterL , #2 // at least 4 k-steps to do?
  680. blt .Lctrmm_kernel_L4_M4_32
  // Pipelined path: I + M2 start the pipeline (2 k-steps), each loop pass
  // runs M1 + M2, and M1 + E drain it (last 2 k-steps).
  681. KERNEL4x4_I // do one in the K
  682. KERNEL4x4_M2 // do another in the K
  683. subs counterL, counterL, #2
  684. ble .Lctrmm_kernel_L4_M4_22a
  685. .align 5
  686. .Lctrmm_kernel_L4_M4_22:
  687. KERNEL4x4_M1
  688. KERNEL4x4_M2
  689. subs counterL, counterL, #1
  690. bgt .Lctrmm_kernel_L4_M4_22
  691. .Lctrmm_kernel_L4_M4_22a:
  692. KERNEL4x4_M1
  693. KERNEL4x4_E
  694. b .Lctrmm_kernel_L4_M4_44
  // Short path, counterL < 2: either one pair of k-steps (I + E) or none.
  695. .Lctrmm_kernel_L4_M4_32:
  696. tst counterL, #1
  697. ble .Lctrmm_kernel_L4_M4_40 // taken when bit 0 is clear
  698. KERNEL4x4_I
  699. KERNEL4x4_E
  700. b .Lctrmm_kernel_L4_M4_44
  701. .Lctrmm_kernel_L4_M4_40:
  // No k-step pair ran, so the accumulators must be cleared explicitly.
  702. INIT4x4
  703. .Lctrmm_kernel_L4_M4_44:
  704. ands counterL , tempK, #1 // odd tempK leaves one more k-step
  705. ble .Lctrmm_kernel_L4_M4_100
  706. .Lctrmm_kernel_L4_M4_46:
  707. KERNEL4x4_SUB
  708. .Lctrmm_kernel_L4_M4_100:
  709. SAVE4x4
  // Skip the k-steps that were not processed so pA and pB point past this
  // tile: (origK - tempOffset - 4) steps of 32 bytes each.
  710. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  711. sub tempK, origK, tempOffset
  712. #if defined(LEFT)
  713. sub tempK, tempK, #4
  714. #else
  715. sub tempK, tempK, #4
  716. #endif
  717. lsl temp, tempK, #5
  718. add pA, pA, temp
  719. add pB, pB, temp
  720. #endif
  721. #if defined(LEFT)
  722. add tempOffset, tempOffset, #4
  723. #endif
  724. .Lctrmm_kernel_L4_M4_END:
  725. subs counterI, counterI, #1
  726. bne .Lctrmm_kernel_L4_M4_20
  // M remainder of the current panel: an optional 2x4 tile.
  727. .Lctrmm_kernel_L4_M2_BEGIN:
  728. mov counterI, origM
  729. tst counterI , #3 // M % 4 == 0: no remainder at all
  730. ble .Lctrmm_kernel_L4_END
  731. tst counterI, #2 // bit 1 of M clear: no 2-wide tile
  732. ble .Lctrmm_kernel_L4_M1_BEGIN
  733. .Lctrmm_kernel_L4_M2_20:
  734. INIT2x4
  // Starting k position: A advances 16 bytes per skipped k-step, B 32 bytes.
  735. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  736. mov pB, origPB
  737. #else
  738. mov pB, origPB
  739. lsl temp, tempOffset, #4
  740. add pA, pA, temp
  741. lsl temp, tempOffset, #5
  742. add pB, pB, temp
  743. #endif
  // tempK = number of k-steps this tile really has to process.
  744. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  745. sub tempK, origK, tempOffset
  746. #elif defined(LEFT)
  747. add tempK, tempOffset, #2
  748. #else
  749. add tempK, tempOffset, #4
  750. #endif
  751. asr counterL , tempK, #3 // counterL = tempK / 8
  752. cmp counterL , #0
  753. ble .Lctrmm_kernel_L4_M2_40
  // Main loop, unrolled by 8 k-steps.
  754. .Lctrmm_kernel_L4_M2_22:
  755. KERNEL2x4_SUB
  756. KERNEL2x4_SUB
  757. KERNEL2x4_SUB
  758. KERNEL2x4_SUB
  759. KERNEL2x4_SUB
  760. KERNEL2x4_SUB
  761. KERNEL2x4_SUB
  762. KERNEL2x4_SUB
  763. subs counterL, counterL, #1
  764. bgt .Lctrmm_kernel_L4_M2_22
  765. .Lctrmm_kernel_L4_M2_40:
  766. ands counterL , tempK, #7 // counterL = tempK % 8
  767. ble .Lctrmm_kernel_L4_M2_100
  // Remainder loop, one k-step per pass.
  768. .Lctrmm_kernel_L4_M2_42:
  769. KERNEL2x4_SUB
  770. subs counterL, counterL, #1
  771. bgt .Lctrmm_kernel_L4_M2_42
  772. .Lctrmm_kernel_L4_M2_100:
  773. SAVE2x4
  // Skip the unprocessed k-steps: 16 bytes each for A, 32 bytes each for B.
  774. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  775. sub tempK, origK, tempOffset
  776. #if defined(LEFT)
  777. sub tempK, tempK, #2
  778. #else
  779. sub tempK, tempK, #4
  780. #endif
  781. lsl temp, tempK, #4
  782. add pA, pA, temp
  783. lsl temp, tempK, #5
  784. add pB, pB, temp
  785. #endif
  786. #if defined(LEFT)
  787. add tempOffset, tempOffset, #2
  788. #endif
  789. .Lctrmm_kernel_L4_M2_END:
  // M remainder of the current panel: an optional 1x4 tile.
  // counterI still holds origM, loaded at .Lctrmm_kernel_L4_M2_BEGIN.
  790. .Lctrmm_kernel_L4_M1_BEGIN:
  791. tst counterI, #1 // bit 0 of M clear: no 1-wide tile
  792. ble .Lctrmm_kernel_L4_END
  793. .Lctrmm_kernel_L4_M1_20:
  794. INIT1x4
  // Starting k position: B advances 32 bytes per skipped k-step, A 8 bytes.
  795. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  796. mov pB, origPB
  797. #else
  798. mov pB, origPB
  799. lsl temp, tempOffset, #5
  800. add pB, pB, temp
  801. lsl temp, tempOffset, #3
  802. add pA, pA, temp
  803. #endif
  // tempK = number of k-steps this tile really has to process.
  804. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  805. sub tempK, origK, tempOffset
  806. #elif defined(LEFT)
  807. add tempK, tempOffset, #1
  808. #else
  809. add tempK, tempOffset, #4
  810. #endif
  811. asr counterL , tempK, #3 // counterL = tempK / 8
  812. cmp counterL , #0
  813. ble .Lctrmm_kernel_L4_M1_40
  // Main loop, unrolled by 8 k-steps.
  814. .Lctrmm_kernel_L4_M1_22:
  815. KERNEL1x4_SUB
  816. KERNEL1x4_SUB
  817. KERNEL1x4_SUB
  818. KERNEL1x4_SUB
  819. KERNEL1x4_SUB
  820. KERNEL1x4_SUB
  821. KERNEL1x4_SUB
  822. KERNEL1x4_SUB
  823. subs counterL, counterL, #1
  824. bgt .Lctrmm_kernel_L4_M1_22
  825. .Lctrmm_kernel_L4_M1_40:
  826. ands counterL , tempK, #7 // counterL = tempK % 8
  827. ble .Lctrmm_kernel_L4_M1_100
  // Remainder loop, one k-step per pass.
  828. .Lctrmm_kernel_L4_M1_42:
  829. KERNEL1x4_SUB
  830. subs counterL, counterL, #1
  831. bgt .Lctrmm_kernel_L4_M1_42
  832. .Lctrmm_kernel_L4_M1_100:
  833. SAVE1x4
  // Skip the unprocessed k-steps: 8 bytes each for A, 32 bytes each for B.
  834. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  835. sub tempK, origK, tempOffset
  836. #if defined(LEFT)
  837. sub tempK, tempK, #1
  838. #else
  839. sub tempK, tempK, #4
  840. #endif
  841. lsl temp, tempK, #3
  842. add pA, pA, temp
  843. lsl temp, tempK, #5
  844. add pB, pB, temp
  845. #endif
  846. #if defined(LEFT)
  847. add tempOffset, tempOffset, #1
  848. #endif
  // End of one pass of the counterJ loop over 4-wide N panels: advance origPB
  // past the panel just used, bump tempOffset in the non-LEFT variants, and
  // loop back to .Lctrmm_kernel_L4_BEGIN while counterJ stays positive.
  849. .Lctrmm_kernel_L4_END:
  850. lsl temp, origK, #5 // temp = origK * 32
  851. add origPB, origPB, temp // B = B + K * 4 * 8
  852. #if !defined(LEFT)
  853. add tempOffset, tempOffset, #4 // non-LEFT: offset grows by the panel width (4)
  854. #endif
  855. subs counterJ, counterJ , #1 // j--
  856. bgt .Lctrmm_kernel_L4_BEGIN
  857. /******************************************************************************/
  // ---- Remainder in N after the 4-wide panels: handle a 2-wide panel ----
  // origN & 3 == 0 : nothing left, go to the exit.
  // bit 1 of origN clear : no 2-wide panel, try the 1-wide panel.
  858. .Lctrmm_kernel_L2_BEGIN: // fewer than 4 left in N direction
  859. mov counterJ , origN
  860. tst counterJ , #3 // flags only: any remainder modulo 4?
  861. ble .Lctrmm_kernel_L999 // taken when N % 4 == 0: done
  862. tst counterJ , #2 // flags only: is bit 1 of N set?
  863. ble .Lctrmm_kernel_L1_BEGIN // taken when bit 1 is clear
  864. mov pCRow0, pC // pCRow0 = pC
  865. add pC,pC,LDC, lsl #1 // pC += 2 * LDC
  866. #if defined(LEFT)
  867. mov tempOffset, offset // LEFT: restart tempOffset from offset for this panel
  868. #endif
  869. mov pA, origPA // pA = A
  // ---- 2-wide N panel, 4x2 tiles: loop counterI = origM >> 2 times ----
  // INIT4x2 / KERNEL4x2_SUB / SAVE4x2 are macros defined outside this block;
  // presumably they clear, accumulate and write back the tile - confirm.
  870. .Lctrmm_kernel_L2_M4_BEGIN:
  871. mov counterI, origM
  872. asr counterI, counterI, #2 // counterI = origM >> 2 (number of 4-high tiles)
  873. cmp counterI,#0
  874. ble .Lctrmm_kernel_L2_M2_BEGIN // no 4-high tile
  875. .Lctrmm_kernel_L2_M4_20:
  876. INIT4x2
  // Position pB (and, in the second variant, pA) for this tile.
  877. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  878. mov pB, origPB // start at the base of the current B panel
  879. #else
  880. mov pB, origPB
  881. lsl temp, tempOffset, #4 // temp = tempOffset * 16
  882. add pB, pB, temp // pB = origPB + tempOffset * 16 bytes
  883. lsl temp, tempOffset, #5 // temp = tempOffset * 32
  884. add pA, pA, temp // pA += tempOffset * 32 bytes
  885. #endif
  // tempK = number of KERNEL4x2_SUB steps to run for this tile.
  886. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  887. sub tempK, origK, tempOffset // tempK = origK - tempOffset
  888. #elif defined(LEFT)
  889. add tempK, tempOffset, #4 // tempK = tempOffset + 4
  890. #else
  891. add tempK, tempOffset, #2 // tempK = tempOffset + 2
  892. #endif
  893. asr counterL , tempK, #3 // counterL = tempK >> 3: passes of the 8x unrolled loop
  894. cmp counterL,#0
  895. ble .Lctrmm_kernel_L2_M4_40 // no full group of 8 steps
  896. .align 5
  // Main loop: 8 steps per pass (loop head aligned by the directive above).
  897. .Lctrmm_kernel_L2_M4_22:
  898. KERNEL4x2_SUB
  899. KERNEL4x2_SUB
  900. KERNEL4x2_SUB
  901. KERNEL4x2_SUB
  902. KERNEL4x2_SUB
  903. KERNEL4x2_SUB
  904. KERNEL4x2_SUB
  905. KERNEL4x2_SUB
  906. subs counterL, counterL, #1
  907. bgt .Lctrmm_kernel_L2_M4_22
  // Tail loop: the remaining tempK & 7 steps, one per pass.
  908. .Lctrmm_kernel_L2_M4_40:
  909. ands counterL , tempK, #7 // counterL = tempK & 7
  910. ble .Lctrmm_kernel_L2_M4_100 // nothing left over
  911. .Lctrmm_kernel_L2_M4_42:
  912. KERNEL4x2_SUB
  913. subs counterL, counterL, #1
  914. bgt .Lctrmm_kernel_L2_M4_42
  915. .Lctrmm_kernel_L2_M4_100:
  916. SAVE4x2
  // In these variants the loops above ran tempOffset + 4 (LEFT) or
  // tempOffset + 2 (otherwise) steps, so step pA/pB over the steps that were
  // not executed: origK - tempOffset - {4|2}.
  917. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  918. sub tempK, origK, tempOffset
  919. #if defined(LEFT)
  920. sub tempK, tempK, #4
  921. #else
  922. sub tempK, tempK, #2
  923. #endif
  924. lsl temp, tempK, #5 // temp = tempK * 32
  925. add pA, pA, temp // pA += tempK * 32 bytes
  926. lsl temp, tempK, #4 // temp = tempK * 16
  927. add pB, pB, temp // pB += tempK * 16 bytes
  928. #endif
  929. #if defined(LEFT)
  930. add tempOffset, tempOffset, #4 // LEFT: offset grows by the tile height (4)
  931. #endif
  932. .Lctrmm_kernel_L2_M4_END:
  933. subs counterI, counterI, #1 // next 4-high tile
  934. bgt .Lctrmm_kernel_L2_M4_20
  // ---- 2-wide N panel, 2x2 tile (remainder in M) ----
  // origM & 3 == 0 : no remainder in M, this panel is finished.
  // bit 1 of origM clear : no 2-high tile, try the 1-high tile.
  // INIT2x2 / KERNEL2x2_SUB / SAVE2x2 are macros defined outside this block;
  // presumably they clear, accumulate and write back the tile - confirm.
  935. .Lctrmm_kernel_L2_M2_BEGIN:
  936. mov counterI, origM
  937. tst counterI , #3 // flags only: any remainder modulo 4?
  938. ble .Lctrmm_kernel_L2_END // taken when M % 4 == 0
  939. tst counterI, #2 // flags only: is bit 1 of M set? (counterI is not modified)
  940. ble .Lctrmm_kernel_L2_M1_BEGIN // taken when bit 1 is clear
  941. .Lctrmm_kernel_L2_M2_20:
  942. INIT2x2
  // Position pB (and, in the second variant, pA) for this tile.
  943. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  944. mov pB, origPB // start at the base of the current B panel
  945. #else
  946. mov pB, origPB
  947. lsl temp, tempOffset, #4 // temp = tempOffset * 16
  948. add pB, pB, temp // pB = origPB + tempOffset * 16 bytes
  949. lsl temp, tempOffset, #4 // temp = tempOffset * 16
  950. add pA, pA, temp // pA += tempOffset * 16 bytes
  951. #endif
  // tempK = number of KERNEL2x2_SUB steps; both non-subtracting variants add 2.
  952. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  953. sub tempK, origK, tempOffset // tempK = origK - tempOffset
  954. #elif defined(LEFT)
  955. add tempK, tempOffset, #2 // tempK = tempOffset + 2
  956. #else
  957. add tempK, tempOffset, #2 // tempK = tempOffset + 2
  958. #endif
  959. asr counterL , tempK, #3 // counterL = tempK >> 3: passes of the 8x unrolled loop
  960. cmp counterL,#0
  961. ble .Lctrmm_kernel_L2_M2_40 // no full group of 8 steps
  // Main loop: 8 steps per pass.
  962. .Lctrmm_kernel_L2_M2_22:
  963. KERNEL2x2_SUB
  964. KERNEL2x2_SUB
  965. KERNEL2x2_SUB
  966. KERNEL2x2_SUB
  967. KERNEL2x2_SUB
  968. KERNEL2x2_SUB
  969. KERNEL2x2_SUB
  970. KERNEL2x2_SUB
  971. subs counterL, counterL, #1
  972. bgt .Lctrmm_kernel_L2_M2_22
  // Tail loop: the remaining tempK & 7 steps, one per pass.
  973. .Lctrmm_kernel_L2_M2_40:
  974. ands counterL , tempK, #7 // counterL = tempK & 7
  975. ble .Lctrmm_kernel_L2_M2_100 // nothing left over
  976. .Lctrmm_kernel_L2_M2_42:
  977. KERNEL2x2_SUB
  978. subs counterL, counterL, #1
  979. bgt .Lctrmm_kernel_L2_M2_42
  980. .Lctrmm_kernel_L2_M2_100:
  981. SAVE2x2
  // In these variants the loops above ran tempOffset + 2 steps, so step pA/pB
  // over the steps that were not executed: origK - tempOffset - 2.
  982. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  983. sub tempK, origK, tempOffset
  984. #if defined(LEFT)
  985. sub tempK, tempK, #2
  986. #else
  987. sub tempK, tempK, #2
  988. #endif
  989. lsl temp, tempK, #4 // temp = tempK * 16
  990. add pA, pA, temp // pA += tempK * 16 bytes
  991. lsl temp, tempK, #4 // temp = tempK * 16
  992. add pB, pB, temp // pB += tempK * 16 bytes
  993. #endif
  994. #if defined(LEFT)
  995. add tempOffset, tempOffset, #2 // LEFT: offset grows by the tile height (2)
  996. #endif
  997. .Lctrmm_kernel_L2_M2_END:
  // ---- 2-wide N panel, 1x2 tile (single leftover step in M) ----
  // Runs only when bit 0 of counterI is set; counterI was loaded from origM
  // at .Lctrmm_kernel_L2_M2_BEGIN and has not been written since.
  // INIT1x2 / KERNEL1x2_SUB / SAVE1x2 are macros defined outside this block;
  // presumably they clear, accumulate and write back the tile - confirm.
  998. .Lctrmm_kernel_L2_M1_BEGIN:
  999. tst counterI, #1 // flags only: test bit 0 of counterI (register is not modified)
  1000. ble .Lctrmm_kernel_L2_END // taken when the masked value is zero
  1001. .Lctrmm_kernel_L2_M1_20:
  1002. INIT1x2
  // Position pB (and, in the second variant, pA) for this tile.
  1003. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1004. mov pB, origPB // start at the base of the current B panel
  1005. #else
  1006. mov pB, origPB
  1007. lsl temp, tempOffset, #4 // temp = tempOffset * 16
  1008. add pB, pB, temp // pB = origPB + tempOffset * 16 bytes
  1009. lsl temp, tempOffset, #3 // temp = tempOffset * 8
  1010. add pA, pA, temp // pA += tempOffset * 8 bytes
  1011. #endif
  // tempK = number of KERNEL1x2_SUB steps to run for this tile.
  1012. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1013. sub tempK, origK, tempOffset // tempK = origK - tempOffset
  1014. #elif defined(LEFT)
  1015. add tempK, tempOffset, #1 // tempK = tempOffset + 1
  1016. #else
  1017. add tempK, tempOffset, #2 // tempK = tempOffset + 2
  1018. #endif
  1019. asr counterL , tempK, #3 // counterL = tempK >> 3: passes of the 8x unrolled loop
  1020. cmp counterL, #0
  1021. ble .Lctrmm_kernel_L2_M1_40 // no full group of 8 steps
  // Main loop: 8 steps per pass.
  1022. .Lctrmm_kernel_L2_M1_22:
  1023. KERNEL1x2_SUB
  1024. KERNEL1x2_SUB
  1025. KERNEL1x2_SUB
  1026. KERNEL1x2_SUB
  1027. KERNEL1x2_SUB
  1028. KERNEL1x2_SUB
  1029. KERNEL1x2_SUB
  1030. KERNEL1x2_SUB
  1031. subs counterL, counterL, #1
  1032. bgt .Lctrmm_kernel_L2_M1_22
  // Tail loop: the remaining tempK & 7 steps, one per pass.
  1033. .Lctrmm_kernel_L2_M1_40:
  1034. ands counterL , tempK, #7 // counterL = tempK & 7
  1035. ble .Lctrmm_kernel_L2_M1_100 // nothing left over
  1036. .Lctrmm_kernel_L2_M1_42:
  1037. KERNEL1x2_SUB
  1038. subs counterL, counterL, #1
  1039. bgt .Lctrmm_kernel_L2_M1_42
  1040. .Lctrmm_kernel_L2_M1_100:
  1041. SAVE1x2
  // In these variants the loops above ran tempOffset + 1 (LEFT) or
  // tempOffset + 2 (otherwise) steps, so step pA/pB over the steps that were
  // not executed: origK - tempOffset - {1|2}.
  1042. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1043. sub tempK, origK, tempOffset
  1044. #if defined(LEFT)
  1045. sub tempK, tempK, #1
  1046. #else
  1047. sub tempK, tempK, #2
  1048. #endif
  1049. lsl temp, tempK, #3 // temp = tempK * 8
  1050. add pA, pA, temp // pA += tempK * 8 bytes
  1051. lsl temp, tempK, #4 // temp = tempK * 16
  1052. add pB, pB, temp // pB += tempK * 16 bytes
  1053. #endif
  1054. #if defined(LEFT)
  1055. add tempOffset, tempOffset, #1 // LEFT: offset grows by the tile height (1)
  1056. #endif
  // End of the 2-wide N panel: bump tempOffset in the non-LEFT variants and
  // advance origPB past the panel (origK * 16 bytes). There is no loop back;
  // control falls through to the 1-wide panel section.
  1057. .Lctrmm_kernel_L2_END:
  1058. #if !defined(LEFT)
  1059. add tempOffset, tempOffset, #2 // non-LEFT: offset grows by the panel width (2)
  1060. #endif
  1061. add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
  1062. /******************************************************************************/
  // ---- Last remainder in N: a 1-wide panel, present when bit 0 of origN is set ----
  1063. .Lctrmm_kernel_L1_BEGIN:
  1064. mov counterJ , origN
  1065. tst counterJ , #1 // flags only: is bit 0 of N set?
  1066. ble .Lctrmm_kernel_L999 // done
  1067. mov pCRow0, pC // pCRow0 = C
  1068. add pC , pC , LDC // Update pC to point to next
  1069. #if defined(LEFT)
  1070. mov tempOffset, offset // LEFT: restart tempOffset from offset for this panel
  1071. #endif
  1072. mov pA, origPA // pA = A
  // ---- 1-wide N panel, 4x1 tiles: loop counterI = origM >> 2 times ----
  // INIT4x1 / KERNEL4x1_SUB / SAVE4x1 are macros defined outside this block;
  // presumably they clear, accumulate and write back the tile - confirm.
  1073. .Lctrmm_kernel_L1_M4_BEGIN:
  1074. mov counterI, origM
  1075. asr counterI, counterI, #2 // counterI = origM >> 2 (number of 4-high tiles)
  1076. cmp counterI, #0
  1077. ble .Lctrmm_kernel_L1_M2_BEGIN // no 4-high tile
  1078. .Lctrmm_kernel_L1_M4_20:
  1079. INIT4x1
  // Position pB (and, in the second variant, pA) for this tile.
  1080. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1081. mov pB, origPB // start at the base of the current B panel
  1082. #else
  1083. mov pB, origPB
  1084. lsl temp, tempOffset, #3 // temp = tempOffset * 8
  1085. add pB, pB, temp // pB = origPB + tempOffset * 8 bytes
  1086. lsl temp, tempOffset, #5 // temp = tempOffset * 32
  1087. add pA, pA, temp // pA += tempOffset * 32 bytes
  1088. #endif
  // tempK = number of KERNEL4x1_SUB steps to run for this tile.
  1089. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1090. sub tempK, origK, tempOffset // tempK = origK - tempOffset
  1091. #elif defined(LEFT)
  1092. add tempK, tempOffset, #4 // tempK = tempOffset + 4
  1093. #else
  1094. add tempK, tempOffset, #1 // tempK = tempOffset + 1
  1095. #endif
  1096. asr counterL , tempK, #3 // counterL = tempK >> 3: passes of the 8x unrolled loop
  1097. cmp counterL , #0
  1098. ble .Lctrmm_kernel_L1_M4_40 // no full group of 8 steps
  1099. .align 5
  // Main loop: 8 steps per pass (loop head aligned by the directive above).
  1100. .Lctrmm_kernel_L1_M4_22:
  1101. KERNEL4x1_SUB
  1102. KERNEL4x1_SUB
  1103. KERNEL4x1_SUB
  1104. KERNEL4x1_SUB
  1105. KERNEL4x1_SUB
  1106. KERNEL4x1_SUB
  1107. KERNEL4x1_SUB
  1108. KERNEL4x1_SUB
  1109. subs counterL, counterL, #1
  1110. bgt .Lctrmm_kernel_L1_M4_22
  // Tail loop: the remaining tempK & 7 steps, one per pass.
  1111. .Lctrmm_kernel_L1_M4_40:
  1112. ands counterL , tempK, #7 // counterL = tempK & 7
  1113. ble .Lctrmm_kernel_L1_M4_100 // nothing left over
  1114. .Lctrmm_kernel_L1_M4_42:
  1115. KERNEL4x1_SUB
  1116. subs counterL, counterL, #1
  1117. bgt .Lctrmm_kernel_L1_M4_42
  1118. .Lctrmm_kernel_L1_M4_100:
  1119. SAVE4x1
  // In these variants the loops above ran tempOffset + 4 (LEFT) or
  // tempOffset + 1 (otherwise) steps, so step pA/pB over the steps that were
  // not executed: origK - tempOffset - {4|1}.
  1120. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1121. sub tempK, origK, tempOffset
  1122. #if defined(LEFT)
  1123. sub tempK, tempK, #4
  1124. #else
  1125. sub tempK, tempK, #1
  1126. #endif
  1127. lsl temp, tempK, #5 // temp = tempK * 32
  1128. add pA, pA, temp // pA += tempK * 32 bytes
  1129. lsl temp, tempK, #3 // temp = tempK * 8
  1130. add pB, pB, temp // pB += tempK * 8 bytes
  1131. #endif
  1132. #if defined(LEFT)
  1133. add tempOffset, tempOffset, #4 // LEFT: offset grows by the tile height (4)
  1134. #endif
  1135. .Lctrmm_kernel_L1_M4_END:
  1136. subs counterI, counterI, #1 // next 4-high tile
  1137. bgt .Lctrmm_kernel_L1_M4_20
  // ---- 1-wide N panel, 2x1 tile (remainder in M) ----
  // origM & 3 == 0 : no remainder in M, this panel is finished.
  // bit 1 of origM clear : no 2-high tile, try the 1-high tile.
  // INIT2x1 / KERNEL2x1_SUB / SAVE2x1 are macros defined outside this block;
  // presumably they clear, accumulate and write back the tile - confirm.
  1138. .Lctrmm_kernel_L1_M2_BEGIN:
  1139. mov counterI, origM
  1140. tst counterI , #3 // flags only: any remainder modulo 4?
  1141. ble .Lctrmm_kernel_L1_END // taken when M % 4 == 0
  1142. tst counterI, #2 // flags only: is bit 1 of M set? (counterI is not modified)
  1143. ble .Lctrmm_kernel_L1_M1_BEGIN // taken when bit 1 is clear
  1144. .Lctrmm_kernel_L1_M2_20:
  1145. INIT2x1
  // Position pB (and, in the second variant, pA) for this tile.
  1146. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1147. mov pB, origPB // start at the base of the current B panel
  1148. #else
  1149. mov pB, origPB
  1150. lsl temp, tempOffset, #3 // temp = tempOffset * 8
  1151. add pB, pB, temp // pB = origPB + tempOffset * 8 bytes
  1152. lsl temp, tempOffset, #4 // temp = tempOffset * 16
  1153. add pA, pA, temp // pA += tempOffset * 16 bytes
  1154. #endif
  // tempK = number of KERNEL2x1_SUB steps to run for this tile.
  1155. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1156. sub tempK, origK, tempOffset // tempK = origK - tempOffset
  1157. #elif defined(LEFT)
  1158. add tempK, tempOffset, #2 // tempK = tempOffset + 2
  1159. #else
  1160. add tempK, tempOffset, #1 // tempK = tempOffset + 1
  1161. #endif
  1162. asr counterL , tempK, #3 // counterL = tempK >> 3: passes of the 8x unrolled loop
  1163. cmp counterL , #0
  1164. ble .Lctrmm_kernel_L1_M2_40 // no full group of 8 steps
  // Main loop: 8 steps per pass.
  1165. .Lctrmm_kernel_L1_M2_22:
  1166. KERNEL2x1_SUB
  1167. KERNEL2x1_SUB
  1168. KERNEL2x1_SUB
  1169. KERNEL2x1_SUB
  1170. KERNEL2x1_SUB
  1171. KERNEL2x1_SUB
  1172. KERNEL2x1_SUB
  1173. KERNEL2x1_SUB
  1174. subs counterL, counterL, #1
  1175. bgt .Lctrmm_kernel_L1_M2_22
  // Tail loop: the remaining tempK & 7 steps, one per pass.
  1176. .Lctrmm_kernel_L1_M2_40:
  1177. ands counterL , tempK, #7 // counterL = tempK & 7
  1178. ble .Lctrmm_kernel_L1_M2_100 // nothing left over
  1179. .Lctrmm_kernel_L1_M2_42:
  1180. KERNEL2x1_SUB
  1181. subs counterL, counterL, #1
  1182. bgt .Lctrmm_kernel_L1_M2_42
  1183. .Lctrmm_kernel_L1_M2_100:
  1184. SAVE2x1
  // In these variants the loops above ran tempOffset + 2 (LEFT) or
  // tempOffset + 1 (otherwise) steps, so step pA/pB over the steps that were
  // not executed: origK - tempOffset - {2|1}.
  1185. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1186. sub tempK, origK, tempOffset
  1187. #if defined(LEFT)
  1188. sub tempK, tempK, #2
  1189. #else
  1190. sub tempK, tempK, #1
  1191. #endif
  1192. lsl temp, tempK, #4 // temp = tempK * 16
  1193. add pA, pA, temp // pA += tempK * 16 bytes
  1194. lsl temp, tempK, #3 // temp = tempK * 8
  1195. add pB, pB, temp // pB += tempK * 8 bytes
  1196. #endif
  1197. #if defined(LEFT)
  1198. add tempOffset, tempOffset, #2 // LEFT: offset grows by the tile height (2)
  1199. #endif
  1200. .Lctrmm_kernel_L1_M2_END:
  // ---- 1-wide N panel, 1x1 tile (single leftover step in M) ----
  // Runs only when bit 0 of counterI is set; counterI was loaded from origM
  // at .Lctrmm_kernel_L1_M2_BEGIN and has not been written since.
  // This is the last tile before the exit label, so unlike the other tile
  // sections nothing after SAVE1x1 advances pA, pB or tempOffset.
  // INIT1x1 / KERNEL1x1_SUB / SAVE1x1 are macros defined outside this block;
  // presumably they clear, accumulate and write back the tile - confirm.
  1201. .Lctrmm_kernel_L1_M1_BEGIN:
  1202. tst counterI, #1 // flags only: test bit 0 of counterI (register is not modified)
  1203. ble .Lctrmm_kernel_L1_END // taken when the masked value is zero
  1204. .Lctrmm_kernel_L1_M1_20:
  1205. INIT1x1
  // Position pB (and, in the second variant, pA) for this tile.
  1206. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1207. mov pB, origPB // start at the base of the current B panel
  1208. #else
  1209. mov pB, origPB
  1210. lsl temp, tempOffset, #3 // temp = tempOffset * 8
  1211. add pB, pB, temp // pB = origPB + tempOffset * 8 bytes
  1212. lsl temp, tempOffset, #3 // temp = tempOffset * 8
  1213. add pA, pA, temp // pA += tempOffset * 8 bytes
  1214. #endif
  // tempK = number of KERNEL1x1_SUB steps; both non-subtracting variants add 1.
  1215. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1216. sub tempK, origK, tempOffset // tempK = origK - tempOffset
  1217. #elif defined(LEFT)
  1218. add tempK, tempOffset, #1 // tempK = tempOffset + 1
  1219. #else
  1220. add tempK, tempOffset, #1 // tempK = tempOffset + 1
  1221. #endif
  1222. asr counterL , tempK, #3 // counterL = tempK >> 3: passes of the 8x unrolled loop
  1223. cmp counterL , #0
  1224. ble .Lctrmm_kernel_L1_M1_40 // no full group of 8 steps
  // Main loop: 8 steps per pass.
  1225. .Lctrmm_kernel_L1_M1_22:
  1226. KERNEL1x1_SUB
  1227. KERNEL1x1_SUB
  1228. KERNEL1x1_SUB
  1229. KERNEL1x1_SUB
  1230. KERNEL1x1_SUB
  1231. KERNEL1x1_SUB
  1232. KERNEL1x1_SUB
  1233. KERNEL1x1_SUB
  1234. subs counterL, counterL, #1
  1235. bgt .Lctrmm_kernel_L1_M1_22
  // Tail loop: the remaining tempK & 7 steps, one per pass.
  1236. .Lctrmm_kernel_L1_M1_40:
  1237. ands counterL , tempK, #7 // counterL = tempK & 7
  1238. ble .Lctrmm_kernel_L1_M1_100 // nothing left over
  1239. .Lctrmm_kernel_L1_M1_42:
  1240. KERNEL1x1_SUB
  1241. subs counterL, counterL, #1
  1242. bgt .Lctrmm_kernel_L1_M1_42
  1243. .Lctrmm_kernel_L1_M1_100:
  1244. SAVE1x1
  // ---- Common exit ----
  // Reached by fall-through after the last tile and by the early-out branches
  // in the N-remainder sections. Returns 0 in x0, reloads d8-d17 and x18-x28
  // from the 11 * 16 byte stack frame, releases the frame and returns.
  // NOTE(review): the matching stores are outside this block; the slot layout
  // here presumably mirrors them - confirm against the prologue.
  // NOTE(review): under AAPCS64 only d8-d15 and x19-x28 are callee-saved, and
  // x18 is platform-reserved on some OSes, so the d16/d17 and x18 reloads look
  // redundant or platform-sensitive - verify before changing.
  1245. .Lctrmm_kernel_L1_END:
  1246. .Lctrmm_kernel_L999:
  1247. mov x0, #0 // set return value
  1248. ldp d8, d9, [sp, #(0 * 16)]
  1249. ldp d10, d11, [sp, #(1 * 16)]
  1250. ldp d12, d13, [sp, #(2 * 16)]
  1251. ldp d14, d15, [sp, #(3 * 16)]
  1252. ldp d16, d17, [sp, #(4 * 16)]
  1253. ldp x18, x19, [sp, #(5 * 16)]
  1254. ldp x20, x21, [sp, #(6 * 16)]
  1255. ldp x22, x23, [sp, #(7 * 16)]
  1256. ldp x24, x25, [sp, #(8 * 16)]
  1257. ldp x26, x27, [sp, #(9 * 16)]
  1258. ldr x28, [sp, #(10 * 16)] // single register in the last 16-byte slot
  1259. add sp, sp, #(11*16) // release the 176-byte frame
  1260. ret
  1261. EPILOGUE