You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

dtrmm_kernel_4x8.S 39 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026
  1. /*******************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. /* X0 X1 X2 s0 X3 x4 x5 x6 x7*/
  30. /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */
  31. #define origM x0
  32. #define origN x1
  33. #define origK x2
  34. #define origPA x3
  35. #define origPB x4
  36. #define pC x5
  37. #define LDC x6
  38. #define offset x7
  39. #define counterL x8
  40. #define counterI x9
  41. #define counterJ x10
  42. #define pB x11
  43. #define pCRow0 x12
  44. #define pCRow1 x13
  45. #define pCRow2 x14
  46. #define pA x15
  47. #define temp x16
  48. #define tempOffset x17
  49. #define tempK x18
  50. #define alpha0 d2
  51. #define alphaV0 v2.d[0]
  52. #define alpha1 d3
  53. #define alphaV1 v3.d[0]
  54. #define alpha2 d6
  55. #define alphaV2 v6.d[0]
  56. #define alpha3 d7
  57. #define alphaV3 v7.d[0]
  58. // 00 origM
  59. // 01 origN
  60. // 02 origK
  61. // 03 origPA
  62. // 04 origPB
  63. // 05 pC
  64. // 06 origLDC -> LDC
  65. // 07 offset
  66. // 08 counterL
  67. // 09 counterI
  68. // 10 counterJ
  69. // 11 pB
  70. // 12 pCRow0
  71. // 13 pCRow1
  72. // 14 pCRow2
  73. // 15 pA
  74. // 16 temp
  75. // 17 tempOffset
  76. // 18 must save tempK
  77. // 19 must save
  78. // 20 must save
  79. // 21 must save
  80. // 22 must save
  81. // 23 must save
  82. // 24 must save
  83. // 25 must save
  84. // 26 must save
  85. // 27 must save
  86. // 28 must save
  87. // 29 frame
  88. // 30 link
  89. // 31 sp
  90. //v00 ALPHA -> pA00, pA01
  91. //v01 pA02, pA03
  92. //v02 ALPHA0
  93. //v03 ALPHA1
  94. //v04 pA10, pA11
  95. //v05 pA12, pA13
  96. //v06 ALPHA2
  97. //v07 ALPHA3
  98. //v08 must save pB0_0, pB0_1
  99. //v09 must save pB0_2, pB0_3
  100. //v10 must save pB0_4, pB0_5
  101. //v11 must save pB0_6, pB0_7
  102. //v12 must save pB1_0, pB1_1
  103. //v13 must save pB1_2, pB1_3
  104. //v14 must save pB1_4, pB1_5
  105. //v15 must save pB1_6, pB1_7
  106. //v16 must save C00, C01
  107. //v17 must save C02, C03
  108. //v18 C04, C05
  109. //v19 C06, C07
  110. //v20 C10, C11
  111. //v21 C12, C13
  112. //v22 C14, C15
  113. //v23 C16, C17
  114. //v24 C20, C21
  115. //v25 C22, C23
  116. //v26 C24, C25
  117. //v27 C26, C27
  118. //v28 C30, C31
  119. //v29 C32, C33
  120. //v30 C34, C35
  121. //v31 C36, C37
  122. /*******************************************************************************
  123. * Macro definitions
  124. *******************************************************************************/
  // ------------------------------------------------------------------
  // 4x8 tile: C[4 cols of A] x [8 rows of B], accumulated in v16-v31
  // (16 vectors, 2 doubles each = 32 C elements).
  // Writing d<n> with fmov zeroes the full 128-bit v<n>; the
  // "fmov d<n>, d16" forms copy an already-zeroed register, same effect.
  // ------------------------------------------------------------------
  125. .macro INIT4x8
  126. fmov d16, xzr
  127. fmov d17, xzr
  128. fmov d18, xzr
  129. fmov d19, d16
  130. fmov d20, xzr
  131. fmov d21, d16
  132. fmov d22, d17
  133. fmov d23, d18
  134. fmov d24, xzr
  135. fmov d25, d16
  136. fmov d26, d17
  137. fmov d27, d18
  138. fmov d28, xzr
  139. fmov d29, d16
  140. fmov d30, d17
  141. fmov d31, d18
  142. .endm
  // KERNEL4x8_I: pipeline priming step. Loads the first A/B panels
  // (v0-v1 from pA, v8-v11 from pB), initializes the accumulators with
  // fmul (no prior zeroing needed), then pre-loads the NEXT iteration's
  // operands into v4-v5 / v12-v15 for consumption by KERNEL4x8_M2.
  143. .macro KERNEL4x8_I
  144. ld1 {v8.2d, v9.2d}, [pB]
  145. add pB, pB, #32
  146. ld1 {v0.2d, v1.2d}, [pA]
  147. add pA, pA, #32
  148. ld1 {v10.2d, v11.2d}, [pB]
  149. add pB, pB, #32
  // v16/v17 hold column 0 (A elements 0-1 / 2-3), v18/v19 column 1, etc.
  150. fmul v16.2d, v0.2d, v8.d[0]
  151. fmul v17.2d, v1.2d, v8.d[0]
  152. fmul v18.2d, v0.2d, v8.d[1]
  153. fmul v19.2d, v1.2d, v8.d[1]
  154. fmul v20.2d, v0.2d, v9.d[0]
  155. fmul v21.2d, v1.2d, v9.d[0]
  156. fmul v22.2d, v0.2d, v9.d[1]
  157. fmul v23.2d, v1.2d, v9.d[1]
  158. fmul v24.2d, v0.2d, v10.d[0]
  159. fmul v25.2d, v1.2d, v10.d[0]
  160. fmul v26.2d, v0.2d, v10.d[1]
  161. fmul v27.2d, v1.2d, v10.d[1]
  162. fmul v28.2d, v0.2d, v11.d[0]
  163. fmul v29.2d, v1.2d, v11.d[0]
  164. fmul v30.2d, v0.2d, v11.d[1]
  165. fmul v31.2d, v1.2d, v11.d[1]
  // Pre-load operands for the following KERNEL4x8_M2 step.
  166. ld1 {v12.2d, v13.2d}, [pB]
  167. add pB, pB, #32
  168. ld1 {v4.2d, v5.2d}, [pA]
  169. add pA, pA, #32
  170. ld1 {v14.2d, v15.2d}, [pB]
  171. add pB, pB, #32
  172. .endm
  // KERNEL4x8_M1: software-pipelined step. Accumulates using the
  // v0/v1, v8-v11 operands loaded by the previous M2 (or I) step,
  // while loading v4/v5, v12-v15 for the next M2 step.
  173. .macro KERNEL4x8_M1
  174. fmla v16.2d, v0.2d, v8.d[0]
  175. fmla v17.2d, v1.2d, v8.d[0]
  176. fmla v18.2d, v0.2d, v8.d[1]
  177. fmla v19.2d, v1.2d, v8.d[1]
  178. fmla v20.2d, v0.2d, v9.d[0]
  179. fmla v21.2d, v1.2d, v9.d[0]
  180. fmla v22.2d, v0.2d, v9.d[1]
  181. fmla v23.2d, v1.2d, v9.d[1]
  182. fmla v24.2d, v0.2d, v10.d[0]
  183. fmla v25.2d, v1.2d, v10.d[0]
  184. fmla v26.2d, v0.2d, v10.d[1]
  185. fmla v27.2d, v1.2d, v10.d[1]
  186. fmla v28.2d, v0.2d, v11.d[0]
  187. fmla v29.2d, v1.2d, v11.d[0]
  188. fmla v30.2d, v0.2d, v11.d[1]
  189. fmla v31.2d, v1.2d, v11.d[1]
  190. ld1 {v12.2d, v13.2d}, [pB] // For next round
  191. add pB, pB, #32
  192. ld1 {v4.2d, v5.2d}, [pA] // For next round
  193. add pA, pA, #32
  194. ld1 {v14.2d, v15.2d}, [pB]
  195. add pB, pB, #32
  // Prefetch A ahead of the load stream to hide memory latency.
  196. prfm PLDL1KEEP, [pA, #512]
  197. .endm
  // KERNEL4x8_M2: mirror of M1 using the alternate register set
  // (v4/v5, v12-v15) while re-loading v0/v1, v8-v11 for the next M1.
  198. .macro KERNEL4x8_M2
  199. fmla v16.2d, v4.2d, v12.d[0]
  200. fmla v17.2d, v5.2d, v12.d[0]
  201. fmla v18.2d, v4.2d, v12.d[1]
  202. fmla v19.2d, v5.2d, v12.d[1]
  203. fmla v20.2d, v4.2d, v13.d[0]
  204. fmla v21.2d, v5.2d, v13.d[0]
  205. fmla v22.2d, v4.2d, v13.d[1]
  206. fmla v23.2d, v5.2d, v13.d[1]
  207. fmla v24.2d, v4.2d, v14.d[0]
  208. fmla v25.2d, v5.2d, v14.d[0]
  209. fmla v26.2d, v4.2d, v14.d[1]
  210. fmla v27.2d, v5.2d, v14.d[1]
  211. fmla v28.2d, v4.2d, v15.d[0]
  212. fmla v29.2d, v5.2d, v15.d[0]
  213. fmla v30.2d, v4.2d, v15.d[1]
  214. fmla v31.2d, v5.2d, v15.d[1]
  215. ld1 {v8.2d, v9.2d}, [pB] // For next round
  216. add pB, pB, #32
  217. ld1 {v0.2d, v1.2d}, [pA] // For next round
  218. add pA, pA, #32
  219. ld1 {v10.2d, v11.2d}, [pB]
  220. add pB, pB, #32
  221. prfm PLDL1KEEP, [pB, #512]
  222. .endm
  // KERNEL4x8_E: pipeline drain. Consumes the operands pre-loaded by
  // the final M1 without issuing further loads.
  223. .macro KERNEL4x8_E
  224. fmla v16.2d, v4.2d, v12.d[0]
  225. fmla v17.2d, v5.2d, v12.d[0]
  226. fmla v18.2d, v4.2d, v12.d[1]
  227. fmla v19.2d, v5.2d, v12.d[1]
  228. fmla v20.2d, v4.2d, v13.d[0]
  229. fmla v21.2d, v5.2d, v13.d[0]
  230. fmla v22.2d, v4.2d, v13.d[1]
  231. fmla v23.2d, v5.2d, v13.d[1]
  232. fmla v24.2d, v4.2d, v14.d[0]
  233. fmla v25.2d, v5.2d, v14.d[0]
  234. fmla v26.2d, v4.2d, v14.d[1]
  235. fmla v27.2d, v5.2d, v14.d[1]
  236. fmla v28.2d, v4.2d, v15.d[0]
  237. fmla v29.2d, v5.2d, v15.d[0]
  238. fmla v30.2d, v4.2d, v15.d[1]
  239. fmla v31.2d, v5.2d, v15.d[1]
  240. .endm
  // KERNEL4x8_SUB: stand-alone (non-pipelined) K-step for remainder
  // iterations: load one A/B panel and accumulate it immediately.
  241. .macro KERNEL4x8_SUB
  242. ld1 {v8.2d, v9.2d}, [pB] // used immediately below (not pipelined)
  243. add pB, pB, #32
  244. ld1 {v0.2d, v1.2d}, [pA] // used immediately below (not pipelined)
  245. add pA, pA, #32
  246. ld1 {v10.2d, v11.2d}, [pB]
  247. add pB, pB, #32
  248. fmla v16.2d, v0.2d, v8.d[0]
  249. fmla v17.2d, v1.2d, v8.d[0]
  250. fmla v18.2d, v0.2d, v8.d[1]
  251. fmla v19.2d, v1.2d, v8.d[1]
  252. fmla v20.2d, v0.2d, v9.d[0]
  253. fmla v21.2d, v1.2d, v9.d[0]
  254. fmla v22.2d, v0.2d, v9.d[1]
  255. fmla v23.2d, v1.2d, v9.d[1]
  256. fmla v24.2d, v0.2d, v10.d[0]
  257. fmla v25.2d, v1.2d, v10.d[0]
  258. fmla v26.2d, v0.2d, v10.d[1]
  259. fmla v27.2d, v1.2d, v10.d[1]
  260. fmla v28.2d, v0.2d, v11.d[0]
  261. fmla v29.2d, v1.2d, v11.d[0]
  262. fmla v30.2d, v0.2d, v11.d[1]
  263. fmla v31.2d, v1.2d, v11.d[1]
  264. .endm
  // SAVE4x8: write the 4x8 tile to C, scaled by alpha. C is OVERWRITTEN,
  // not accumulated (no load of C occurs) — TRMM semantics. alphaV0-3
  // all hold the same alpha value (copied in the prologue); four copies
  // are used so consecutive fmul's don't contend for one register.
  // pCRow1/pCRow2 leapfrog down the rows: each "add" computes the
  // pointer for the row after next while the current row is stored.
  265. .macro SAVE4x8
  266. add pCRow1, pCRow0, LDC
  267. fmul v8.2d, v16.2d, alphaV0
  268. fmul v9.2d, v17.2d, alphaV1
  269. st1 {v8.2d, v9.2d}, [pCRow0]
  270. add pCRow2, pCRow1, LDC
  271. fmul v10.2d, v18.2d, alphaV2
  272. fmul v11.2d, v19.2d, alphaV3
  273. st1 {v10.2d, v11.2d}, [pCRow1]
  274. add pCRow1, pCRow2, LDC
  275. fmul v12.2d, v20.2d, alphaV0
  276. fmul v13.2d, v21.2d, alphaV1
  277. st1 {v12.2d, v13.2d}, [pCRow2]
  278. add pCRow2, pCRow1, LDC
  279. fmul v14.2d, v22.2d, alphaV2
  280. fmul v15.2d, v23.2d, alphaV3
  281. st1 {v14.2d, v15.2d}, [pCRow1]
  282. add pCRow1, pCRow2, LDC
  283. fmul v8.2d, v24.2d, alphaV0
  284. fmul v9.2d, v25.2d, alphaV1
  285. st1 {v8.2d, v9.2d}, [pCRow2]
  286. add pCRow2, pCRow1, LDC
  287. fmul v10.2d, v26.2d, alphaV2
  288. fmul v11.2d, v27.2d, alphaV3
  289. st1 {v10.2d, v11.2d}, [pCRow1]
  290. add pCRow1, pCRow2, LDC
  291. fmul v12.2d, v28.2d, alphaV0
  292. fmul v13.2d, v29.2d, alphaV1
  293. st1 {v12.2d, v13.2d}, [pCRow2]
  294. fmul v14.2d, v30.2d, alphaV2
  295. fmul v15.2d, v31.2d, alphaV3
  296. st1 {v14.2d, v15.2d}, [pCRow1]
  // Advance the C column pointer by 4 doubles for the next 4-wide tile.
  297. add pCRow0, pCRow0, #32
  298. .endm
  299. /******************************************************************************/
  // ------------------------------------------------------------------
  // 2x8 tile: 2 columns of A x 8 rows of B, accumulators in the even
  // registers v16..v30 (one 2-double vector per B row).
  // ------------------------------------------------------------------
  300. .macro INIT2x8
  301. fmov d16, xzr
  302. fmov d18, xzr
  303. fmov d20, xzr
  304. fmov d22, d16
  305. fmov d24, xzr
  306. fmov d26, d16
  307. fmov d28, xzr
  308. fmov d30, d16
  309. .endm
  // KERNEL2x8_SUB: one K-step — load 2 doubles of A and 8 of B,
  // accumulate each B element against the A pair.
  310. .macro KERNEL2x8_SUB
  311. ld1 {v8.2d, v9.2d}, [pB]
  312. add pB, pB, #32
  313. ld1 {v0.2d}, [pA]
  314. add pA, pA, #16
  315. ld1 {v10.2d, v11.2d}, [pB]
  316. add pB, pB, #32
  317. fmla v16.2d, v0.2d, v8.d[0]
  318. fmla v18.2d, v0.2d, v8.d[1]
  319. fmla v20.2d, v0.2d, v9.d[0]
  320. fmla v22.2d, v0.2d, v9.d[1]
  321. fmla v24.2d, v0.2d, v10.d[0]
  322. fmla v26.2d, v0.2d, v10.d[1]
  323. fmla v28.2d, v0.2d, v11.d[0]
  324. fmla v30.2d, v0.2d, v11.d[1]
  325. .endm
  // SAVE2x8: store 8 rows x 2 columns of alpha-scaled results.
  // C is overwritten (no read-modify-write). pCRow1/pCRow2 leapfrog
  // down the rows as in SAVE4x8; the final "add pCRow2" result is not
  // read again inside this macro (kept for pattern symmetry).
  326. .macro SAVE2x8
  327. add pCRow1, pCRow0, LDC
  328. fmul v8.2d, v16.2d, alphaV0
  329. st1 {v8.2d}, [pCRow0]
  330. add pCRow2, pCRow1, LDC
  331. fmul v10.2d, v18.2d, alphaV2
  332. st1 {v10.2d}, [pCRow1]
  333. add pCRow1, pCRow2, LDC
  334. fmul v12.2d, v20.2d, alphaV0
  335. st1 {v12.2d}, [pCRow2]
  336. add pCRow2, pCRow1, LDC
  337. fmul v14.2d, v22.2d, alphaV2
  338. st1 {v14.2d}, [pCRow1]
  339. add pCRow1, pCRow2, LDC
  340. fmul v8.2d, v24.2d, alphaV0
  341. st1 {v8.2d}, [pCRow2]
  342. add pCRow2, pCRow1, LDC
  343. fmul v10.2d, v26.2d, alphaV2
  344. st1 {v10.2d}, [pCRow1]
  345. add pCRow1, pCRow2, LDC
  346. fmul v12.2d, v28.2d, alphaV0
  347. st1 {v12.2d}, [pCRow2]
  348. add pCRow2, pCRow1, LDC
  349. fmul v14.2d, v30.2d, alphaV2
  350. st1 {v14.2d}, [pCRow1]
  // Advance C pointer by 2 doubles.
  351. add pCRow0, pCRow0, #16
  352. .endm
  353. /******************************************************************************/
  // ------------------------------------------------------------------
  // 1x8 tile: a single A column against 8 B rows. Here the roles flip:
  // the B vectors are multiplied by the A scalar (v0.d[0]), and each
  // accumulator vector holds results for TWO consecutive C rows.
  // ------------------------------------------------------------------
  354. .macro INIT1x8
  355. fmov d16, xzr
  356. fmov d20, xzr
  357. fmov d24, xzr
  358. fmov d28, xzr
  359. .endm
  360. .macro KERNEL1x8_SUB
  361. ld1 {v8.2d, v9.2d}, [pB]
  362. add pB, pB, #32
  363. ldr d0, [pA]
  364. add pA, pA, #8
  365. ld1 {v10.2d, v11.2d}, [pB]
  366. add pB, pB, #32
  367. fmla v16.2d, v8.2d, v0.d[0]
  368. fmla v20.2d, v9.2d, v0.d[0]
  369. fmla v24.2d, v10.2d, v0.d[0]
  370. fmla v28.2d, v11.2d, v0.d[0]
  371. .endm
  // SAVE1x8: each product vector carries two row results, so lanes
  // [0] and [1] are stored to two consecutive C rows. C is overwritten.
  372. .macro SAVE1x8
  373. add pCRow1, pCRow0, LDC
  374. fmul v8.2d, v16.2d, alphaV0
  375. st1 {v8.d}[0], [pCRow0]
  376. st1 {v8.d}[1], [pCRow1]
  377. add pCRow2, pCRow1, LDC
  378. add pCRow1, pCRow2, LDC
  379. fmul v10.2d, v20.2d, alphaV1
  380. st1 {v10.d}[0], [pCRow2]
  381. st1 {v10.d}[1], [pCRow1]
  382. add pCRow2, pCRow1, LDC
  383. add pCRow1, pCRow2, LDC
  384. fmul v12.2d, v24.2d, alphaV2
  385. st1 {v12.d}[0], [pCRow2]
  386. st1 {v12.d}[1], [pCRow1]
  387. add pCRow2, pCRow1, LDC
  388. add pCRow1, pCRow2, LDC
  389. fmul v14.2d, v28.2d, alphaV3
  390. st1 {v14.d}[0], [pCRow2]
  391. st1 {v14.d}[1], [pCRow1]
  // Advance C pointer by 1 double.
  392. add pCRow0, pCRow0, #8
  393. .endm
  394. /******************************************************************************/
  // ------------------------------------------------------------------
  // 4x4 tile: accumulators v16/v17, v20/v21, v24/v25, v28/v29
  // (one register pair per B row; 2 doubles each).
  // ------------------------------------------------------------------
  395. .macro INIT4x4
  396. fmov d16, xzr
  397. fmov d17, d16
  398. fmov d20, d17
  399. fmov d21, d16
  400. fmov d24, d17
  401. fmov d25, d16
  402. fmov d28, d17
  403. fmov d29, d16
  404. .endm
  // KERNEL4x4_I: pipeline priming — first panel multiplied with fmul,
  // next panel pre-loaded into v4/v5 and v12/v13 for KERNEL4x4_M2.
  // The fmul's are deliberately interleaved (16/29/20/25/...) to
  // spread dependent writes across the register file.
  405. .macro KERNEL4x4_I
  406. ld1 {v8.2d, v9.2d}, [pB]
  407. add pB, pB, #32
  408. ld1 {v0.2d, v1.2d}, [pA]
  409. add pA, pA, #32
  410. fmul v16.2d, v0.2d, v8.d[0]
  411. fmul v29.2d, v1.2d, v9.d[1]
  412. fmul v20.2d, v0.2d, v8.d[1]
  413. fmul v25.2d, v1.2d, v9.d[0]
  414. fmul v24.2d, v0.2d, v9.d[0]
  415. fmul v21.2d, v1.2d, v8.d[1]
  416. fmul v28.2d, v0.2d, v9.d[1]
  417. fmul v17.2d, v1.2d, v8.d[0]
  418. ld1 {v12.2d, v13.2d}, [pB]
  419. add pB, pB, #32
  420. ld1 {v4.2d, v5.2d}, [pA]
  421. add pA, pA, #32
  422. .endm
  // KERNEL4x4_M1: accumulate the v0/v1, v8/v9 operands while loading
  // v4/v5, v12/v13 for the next M2 step; loads and prefetch are
  // interleaved between the fmla's to overlap memory and FP work.
  423. .macro KERNEL4x4_M1
  424. fmla v16.2d, v0.2d, v8.d[0]
  425. fmla v29.2d, v1.2d, v9.d[1]
  426. ld1 {v12.2d, v13.2d}, [pB] // For next round
  427. add pB, pB, #32
  428. fmla v20.2d, v0.2d, v8.d[1]
  429. fmla v25.2d, v1.2d, v9.d[0]
  430. ld1 {v4.2d, v5.2d}, [pA] // For next round
  431. add pA, pA, #32
  432. fmla v24.2d, v0.2d, v9.d[0]
  433. fmla v21.2d, v1.2d, v8.d[1]
  434. prfm PLDL1KEEP, [pA, #512]
  435. fmla v28.2d, v0.2d, v9.d[1]
  436. fmla v17.2d, v1.2d, v8.d[0]
  437. .endm
  // KERNEL4x4_M2: mirror of M1 on the alternate register set
  // (v4/v5, v12/v13), re-loading v0/v1, v8/v9 for the next M1.
  438. .macro KERNEL4x4_M2
  439. fmla v16.2d, v4.2d, v12.d[0]
  440. fmla v29.2d, v5.2d, v13.d[1]
  441. ld1 {v8.2d, v9.2d}, [pB] // For next round
  442. add pB, pB, #32
  443. fmla v20.2d, v4.2d, v12.d[1]
  444. fmla v25.2d, v5.2d, v13.d[0]
  445. ld1 {v0.2d, v1.2d}, [pA] // For next round
  446. add pA, pA, #32
  447. fmla v24.2d, v4.2d, v13.d[0]
  448. fmla v21.2d, v5.2d, v12.d[1]
  449. prfm PLDL1KEEP, [pB, #512]
  450. fmla v28.2d, v4.2d, v13.d[1]
  451. fmla v17.2d, v5.2d, v12.d[0]
  452. .endm
  // KERNEL4x4_E: pipeline drain — consume the last pre-loaded panel,
  // no further loads.
  453. .macro KERNEL4x4_E
  454. fmla v16.2d, v4.2d, v12.d[0]
  455. fmla v29.2d, v5.2d, v13.d[1]
  456. fmla v20.2d, v4.2d, v12.d[1]
  457. fmla v25.2d, v5.2d, v13.d[0]
  458. fmla v24.2d, v4.2d, v13.d[0]
  459. fmla v21.2d, v5.2d, v12.d[1]
  460. fmla v28.2d, v4.2d, v13.d[1]
  461. fmla v17.2d, v5.2d, v12.d[0]
  462. .endm
  // KERNEL4x4_SUB: stand-alone K-step for remainder iterations.
  463. .macro KERNEL4x4_SUB
  464. ld1 {v8.2d, v9.2d}, [pB]
  465. add pB, pB, #32
  466. ld1 {v0.2d, v1.2d}, [pA]
  467. add pA, pA, #32
  468. fmla v16.2d, v0.2d, v8.d[0]
  469. fmla v29.2d, v1.2d, v9.d[1]
  470. fmla v20.2d, v0.2d, v8.d[1]
  471. fmla v25.2d, v1.2d, v9.d[0]
  472. fmla v24.2d, v0.2d, v9.d[0]
  473. fmla v21.2d, v1.2d, v8.d[1]
  474. fmla v28.2d, v0.2d, v9.d[1]
  475. fmla v17.2d, v1.2d, v8.d[0]
  476. .endm
  // SAVE4x4: write the alpha-scaled 4x4 tile to C, row by row.
  // C is overwritten, not accumulated (TRMM semantics).
  477. .macro SAVE4x4
  478. fmul v8.2d, v16.2d, alphaV0
  479. fmul v9.2d, v17.2d, alphaV1
  480. st1 {v8.2d, v9.2d}, [pCRow0]
  481. add pCRow1, pCRow0, LDC
  482. fmul v12.2d, v20.2d, alphaV2
  483. fmul v13.2d, v21.2d, alphaV3
  484. st1 {v12.2d, v13.2d}, [pCRow1]
  485. add pCRow2, pCRow1, LDC
  486. fmul v8.2d, v24.2d, alphaV0
  487. fmul v9.2d, v25.2d, alphaV1
  488. st1 {v8.2d, v9.2d}, [pCRow2]
  489. add pCRow1, pCRow2, LDC
  490. fmul v12.2d, v28.2d, alphaV2
  491. fmul v13.2d, v29.2d, alphaV3
  492. st1 {v12.2d, v13.2d}, [pCRow1]
  // Advance C pointer by 4 doubles.
  493. add pCRow0, pCRow0, #32
  494. .endm
  495. /******************************************************************************/
  // ------------------------------------------------------------------
  // 2x4 tile: accumulators v16, v20, v24, v28 (one vector per B row).
  // ------------------------------------------------------------------
  496. .macro INIT2x4
  497. fmov d16, xzr
  498. fmov d20, d16
  499. fmov d24, d20
  500. fmov d28, d16
  501. .endm
  // KERNEL2x4_SUB: one K-step — 2 doubles of A times 4 of B.
  502. .macro KERNEL2x4_SUB
  503. ld1 {v8.2d, v9.2d}, [pB]
  504. add pB, pB, #32
  505. ld1 {v0.2d}, [pA]
  506. add pA, pA, #16
  507. fmla v16.2d, v0.2d, v8.d[0]
  508. fmla v20.2d, v0.2d, v8.d[1]
  509. fmla v24.2d, v0.2d, v9.d[0]
  510. fmla v28.2d, v0.2d, v9.d[1]
  511. .endm
  // SAVE2x4: store 4 rows x 2 columns of alpha-scaled results;
  // C is overwritten, not accumulated.
  512. .macro SAVE2x4
  513. fmul v8.2d, v16.2d, alphaV0
  514. st1 {v8.2d}, [pCRow0]
  515. add pCRow1, pCRow0, LDC
  516. fmul v12.2d, v20.2d, alphaV1
  517. st1 {v12.2d}, [pCRow1]
  518. add pCRow2, pCRow1, LDC
  519. fmul v8.2d, v24.2d, alphaV2
  520. st1 {v8.2d}, [pCRow2]
  521. add pCRow1, pCRow2, LDC
  522. fmul v12.2d, v28.2d, alphaV3
  523. st1 {v12.2d}, [pCRow1]
  // Advance C pointer by 2 doubles.
  524. add pCRow0, pCRow0, #16
  525. .endm
  526. /******************************************************************************/
  // ------------------------------------------------------------------
  // 1x4 tile: one A scalar times 4 B values; each accumulator vector
  // holds results for two consecutive C rows.
  // ------------------------------------------------------------------
  527. .macro INIT1x4
  528. fmov d16, xzr
  529. fmov d20, d16
  530. .endm
  531. .macro KERNEL1x4_SUB
  532. ldr d0, [pA]
  533. add pA, pA, #8
  534. ld1 {v8.2d, v9.2d}, [pB]
  535. add pB, pB, #32
  536. fmla v16.2d, v8.2d, v0.d[0]
  537. fmla v20.2d, v9.2d, v0.d[0]
  538. .endm
  // SAVE1x4: lanes [0]/[1] of each product go to consecutive C rows.
  539. .macro SAVE1x4
  540. add pCRow1, pCRow0, LDC
  541. fmul v8.2d, v16.2d, alphaV0
  542. st1 {v8.d}[0], [pCRow0]
  543. st1 {v8.d}[1], [pCRow1]
  544. add pCRow2, pCRow1, LDC
  545. add pCRow1, pCRow2, LDC
  546. fmul v12.2d, v20.2d, alphaV1
  547. st1 {v12.d}[0], [pCRow2]
  548. st1 {v12.d}[1], [pCRow1]
  // Advance C pointer by 1 double.
  549. add pCRow0, pCRow0, #8
  550. .endm
  551. /******************************************************************************/
  // ------------------------------------------------------------------
  // 4x2 tile: accumulators v16/v17 (B row 0) and v20/v21 (B row 1).
  // ------------------------------------------------------------------
  552. .macro INIT4x2
  553. fmov d16, xzr
  554. fmov d17, d16
  555. fmov d20, d17
  556. fmov d21, d16
  557. .endm
  // KERNEL4x2_SUB: one K-step — 4 doubles of A times 2 of B.
  558. .macro KERNEL4x2_SUB
  559. ld1 {v8.2d}, [pB]
  560. add pB, pB, #16
  561. ld1 {v0.2d, v1.2d}, [pA]
  562. add pA, pA, #32
  563. fmla v16.2d, v0.2d, v8.d[0]
  564. fmla v17.2d, v1.2d, v8.d[0]
  565. fmla v20.2d, v0.2d, v8.d[1]
  566. fmla v21.2d, v1.2d, v8.d[1]
  567. .endm
  // SAVE4x2: store two alpha-scaled rows; C is overwritten.
  568. .macro SAVE4x2
  569. fmul v8.2d, v16.2d, alphaV0
  570. fmul v9.2d, v17.2d, alphaV1
  571. st1 {v8.2d, v9.2d}, [pCRow0]
  572. add pCRow1, pCRow0, LDC
  573. fmul v12.2d, v20.2d, alphaV2
  574. fmul v13.2d, v21.2d, alphaV3
  575. st1 {v12.2d, v13.2d}, [pCRow1]
  // Advance C pointer by 4 doubles.
  576. add pCRow0, pCRow0, #32
  577. .endm
  578. /******************************************************************************/
  // ------------------------------------------------------------------
  // 2x2 tile: accumulators v16 and v20.
  // ------------------------------------------------------------------
  579. .macro INIT2x2
  580. fmov d16, xzr
  581. fmov d20, d16
  582. .endm
  583. .macro KERNEL2x2_SUB
  584. ld1 {v8.2d}, [pB]
  585. add pB, pB, #16
  586. ld1 {v0.2d}, [pA]
  587. add pA, pA, #16
  588. fmla v16.2d, v0.2d, v8.d[0]
  589. fmla v20.2d, v0.2d, v8.d[1]
  590. .endm
  591. .macro SAVE2x2
  592. fmul v8.2d, v16.2d, alphaV0
  593. st1 {v8.2d}, [pCRow0]
  594. add pCRow1 , pCRow0, LDC
  595. fmul v12.2d, v20.2d, alphaV1
  596. st1 {v12.2d}, [pCRow1]
  // Advance C pointer by 2 doubles.
  597. add pCRow0, pCRow0, #16
  598. .endm
  599. /******************************************************************************/
  // ------------------------------------------------------------------
  // 1x2 tile: one A scalar times 2 B values; v16 lanes [0]/[1] map to
  // two consecutive C rows.
  // ------------------------------------------------------------------
  600. .macro INIT1x2
  601. fmov d16, xzr
  602. .endm
  603. .macro KERNEL1x2_SUB
  604. ld1 {v8.2d} , [pB]
  605. add pB , pB, #16
  606. ldr d0 , [pA]
  607. add pA, pA, #8
  608. fmla v16.2d, v8.2d, v0.d[0]
  609. .endm
  610. .macro SAVE1x2
  611. add pCRow1 , pCRow0, LDC
  612. fmul v8.2d, v16.2d, alphaV0
  613. st1 {v8.d}[0], [pCRow0]
  614. st1 {v8.d}[1], [pCRow1]
  // Advance C pointer by 1 double.
  615. add pCRow0, pCRow0, #8
  616. .endm
  617. /******************************************************************************/
  // ------------------------------------------------------------------
  // 4x1 tile: a single B scalar times 4 doubles of A; accumulators
  // v16/v17.
  // ------------------------------------------------------------------
  618. .macro INIT4x1
  619. fmov d16, xzr
  620. fmov d17, d16
  621. .endm
  622. .macro KERNEL4x1_SUB
  623. ldr d8, [pB]
  624. add pB , pB, #8
  625. ld1 {v0.2d, v1.2d}, [pA]
  626. add pA , pA, #32
  627. fmla v16.2d, v0.2d, v8.d[0]
  628. fmla v17.2d, v1.2d, v8.d[0]
  629. .endm
  // SAVE4x1: store one alpha-scaled 4-element row; C is overwritten.
  630. .macro SAVE4x1
  631. fmul v8.2d, v16.2d, alphaV0
  632. fmul v9.2d, v17.2d, alphaV1
  633. st1 {v8.2d, v9.2d}, [pCRow0]
  // Advance C pointer by 4 doubles.
  634. add pCRow0, pCRow0, #32
  635. .endm
  636. /******************************************************************************/
  // ------------------------------------------------------------------
  // 2x1 tile: one B scalar times 2 doubles of A; accumulator v16.
  // ------------------------------------------------------------------
  637. .macro INIT2x1
  638. fmov d16, xzr
  639. .endm
  640. .macro KERNEL2x1_SUB
  641. ldr d8, [pB]
  642. add pB , pB, #8
  643. ld1 {v0.2d}, [pA]
  644. add pA , pA, #16
  645. fmla v16.2d, v0.2d, v8.d[0]
  646. .endm
  647. .macro SAVE2x1
  648. fmul v8.2d, v16.2d, alphaV0
  649. st1 {v8.2d}, [pCRow0]
  // Advance C pointer by 2 doubles.
  650. add pCRow0, pCRow0, #16
  651. .endm
  652. /******************************************************************************/
  // ------------------------------------------------------------------
  // 1x1 tile: pure scalar FMA, d16 += a * b.
  // ------------------------------------------------------------------
  653. .macro INIT1x1
  654. fmov d16, xzr
  655. .endm
  656. .macro KERNEL1x1_SUB
  657. ldr d8, [pB]
  658. add pB , pB, #8
  659. ldr d0, [pA]
  660. add pA , pA, #8
  661. fmadd d16, d0, d8, d16
  662. .endm
  // SAVE1x1: store the single alpha-scaled element; C is overwritten.
  663. .macro SAVE1x1
  664. fmul d8, d16, alpha0
  665. str d8, [pCRow0]
  // Advance C pointer by 1 double.
  666. add pCRow0, pCRow0, #8
  667. .endm
  668. /*******************************************************************************
  669. * End of macro definitions
  670. *******************************************************************************/
  671. PROLOGUE
  672. .align 5
  673. add sp, sp, #-(11 * 16)
  674. stp d8, d9, [sp, #(0 * 16)]
  675. stp d10, d11, [sp, #(1 * 16)]
  676. stp d12, d13, [sp, #(2 * 16)]
  677. stp d14, d15, [sp, #(3 * 16)]
  678. stp d16, d17, [sp, #(4 * 16)]
  679. stp x18, x19, [sp, #(5 * 16)]
  680. stp x20, x21, [sp, #(6 * 16)]
  681. stp x22, x23, [sp, #(7 * 16)]
  682. stp x24, x25, [sp, #(8 * 16)]
  683. stp x26, x27, [sp, #(9 * 16)]
  684. str x28, [sp, #(10 * 16)]
  685. fmov alpha0, d0
  686. fmov alpha1, d0
  687. fmov alpha2, d0
  688. fmov alpha3, d0
  689. lsl LDC, LDC, #3 // ldc = ldc * 8
  690. #if !defined(LEFT)
  691. neg tempOffset, offset
  692. #endif
  693. mov pB, origPB
  694. mov counterJ, origN
  695. asr counterJ, counterJ, #3 // J = J / 8
  696. cmp counterJ, #0
  697. ble .Ldtrmm_kernel_L4_BEGIN
  698. /******************************************************************************/
  699. .Ldtrmm_kernel_L8_BEGIN:
  700. mov pCRow0, pC // pCRow0 = C
  701. add pC, pC, LDC, lsl #3
  702. #if defined(LEFT)
  703. mov tempOffset, offset
  704. #endif
  705. mov pA, origPA // pA = start of A array
  706. .Ldtrmm_kernel_L8_M4_BEGIN:
  707. mov counterI, origM
  708. asr counterI, counterI, #2 // counterI = counterI / 4
  709. cmp counterI, #0
  710. ble .Ldtrmm_kernel_L8_M2_BEGIN
  711. .Ldtrmm_kernel_L8_M4_20:
  712. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  713. mov pB, origPB
  714. #else
  715. mov pB, origPB
  716. lsl temp, tempOffset, #5
  717. add pA, pA, temp
  718. lsl temp, tempOffset, #6
  719. add pB, pB, temp
  720. #endif
  721. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  722. sub tempK, origK, tempOffset
  723. #elif defined(LEFT)
  724. add tempK, tempOffset, #4
  725. #else
  726. add tempK, tempOffset, #8
  727. #endif
  728. asr counterL, tempK, #1 // L = K / 2
  729. cmp counterL , #2 // is there at least 4 to do?
  730. blt .Ldtrmm_kernel_L8_M4_32
  731. KERNEL4x8_I // do one in the K
  732. KERNEL4x8_M2 // do another in the K
  733. subs counterL, counterL, #2
  734. ble .Ldtrmm_kernel_L8_M4_22a
  735. .align 5
  736. .Ldtrmm_kernel_L8_M4_22:
  737. KERNEL4x8_M1
  738. KERNEL4x8_M2
  739. subs counterL, counterL, #1
  740. bgt .Ldtrmm_kernel_L8_M4_22
  741. .Ldtrmm_kernel_L8_M4_22a:
  742. KERNEL4x8_M1
  743. KERNEL4x8_E
  744. b .Ldtrmm_kernel_L8_M4_44
  745. .Ldtrmm_kernel_L8_M4_32:
  746. tst counterL, #1
  747. ble .Ldtrmm_kernel_L8_M4_40
  748. KERNEL4x8_I
  749. KERNEL4x8_E
  750. b .Ldtrmm_kernel_L8_M4_44
  751. .Ldtrmm_kernel_L8_M4_40:
  752. INIT4x8
  753. .Ldtrmm_kernel_L8_M4_44:
  754. ands counterL, tempK, #1
  755. ble .Ldtrmm_kernel_L8_M4_100
  756. .Ldtrmm_kernel_L8_M4_46:
  757. KERNEL4x8_SUB
  758. .Ldtrmm_kernel_L8_M4_100:
  759. SAVE4x8
  760. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  761. sub tempK, origK, tempOffset
  762. #if defined(LEFT)
  763. sub tempK, tempK, #4
  764. #else
  765. sub tempK, tempK, #8
  766. #endif
  767. lsl temp, tempK, #5
  768. add pA, pA, temp
  769. lsl temp, tempK, #6
  770. add pB, pB, temp
  771. #endif
  772. #if defined(LEFT)
  773. add tempOffset, tempOffset, #4
  774. #endif
  775. .Ldtrmm_kernel_L8_M4_END:
  776. subs counterI, counterI, #1
  777. bne .Ldtrmm_kernel_L8_M4_20
// ---- M-remainder tiles for the N=8 block: 2x8 then 1x8 ----
778. .Ldtrmm_kernel_L8_M2_BEGIN:
779. mov counterI, origM
780. tst counterI , #3
781. ble .Ldtrmm_kernel_L8_END
782. tst counterI, #2 // test M & 2: is there a 2-row tile to do?
783. ble .Ldtrmm_kernel_L8_M1_BEGIN
784. .Ldtrmm_kernel_L8_M2_20:
785. INIT2x8
// Position pA/pB at this tile's first used K element:
// pA += tempOffset*2*8 bytes (lsl #4), pB += tempOffset*8*8 bytes (lsl #6).
786. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
787. mov pB, origPB
788. #else
789. mov pB, origPB
790. lsl temp, tempOffset, #4
791. add pA, pA, temp
792. lsl temp, tempOffset, #6
793. add pB, pB, temp
794. #endif
// Effective K for a 2-row x 8-column tile.
795. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
796. sub tempK, origK, tempOffset
797. #elif defined(LEFT)
798. add tempK, tempOffset, #2
799. #else
800. add tempK, tempOffset, #8
801. #endif
// K loop unrolled 8x, remainder handled one at a time below.
802. asr counterL, tempK, #3 // counterL = tempK / 8
803. cmp counterL , #0
804. ble .Ldtrmm_kernel_L8_M2_40
805. .Ldtrmm_kernel_L8_M2_22:
806. KERNEL2x8_SUB
807. KERNEL2x8_SUB
808. KERNEL2x8_SUB
809. KERNEL2x8_SUB
810. KERNEL2x8_SUB
811. KERNEL2x8_SUB
812. KERNEL2x8_SUB
813. KERNEL2x8_SUB
814. subs counterL, counterL, #1
815. bgt .Ldtrmm_kernel_L8_M2_22
816. .Ldtrmm_kernel_L8_M2_40:
817. ands counterL, tempK, #7 // counterL = tempK % 8
818. ble .Ldtrmm_kernel_L8_M2_100
819. .Ldtrmm_kernel_L8_M2_42:
820. KERNEL2x8_SUB
821. subs counterL, counterL, #1
822. bgt .Ldtrmm_kernel_L8_M2_42
823. .Ldtrmm_kernel_L8_M2_100:
824. SAVE2x8
// TRMM fixup past the unused tail of A/B for this 2x8 tile.
825. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
826. sub tempK, origK, tempOffset
827. #if defined(LEFT)
828. sub tempK, tempK, #2
829. #else
830. sub tempK, tempK, #8
831. #endif
832. lsl temp, tempK, #4
833. add pA, pA, temp
834. lsl temp, tempK, #6
835. add pB, pB, temp
836. #endif
837. #if defined(LEFT)
838. add tempOffset, tempOffset, #2
839. #endif
840. .Ldtrmm_kernel_L8_M2_END:
// ---- 1x8 tile: last row if M is odd ----
841. .Ldtrmm_kernel_L8_M1_BEGIN:
842. tst counterI, #1 // test M & 1: is there a final single row?
843. ble .Ldtrmm_kernel_L8_END
844. .Ldtrmm_kernel_L8_M1_20:
845. INIT1x8
// pA += tempOffset*1*8 bytes (lsl #3), pB += tempOffset*8*8 bytes (lsl #6).
846. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
847. mov pB, origPB
848. #else
849. mov pB, origPB
850. lsl temp, tempOffset, #3
851. add pA, pA, temp
852. lsl temp, tempOffset, #6
853. add pB, pB, temp
854. #endif
855. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
856. sub tempK, origK, tempOffset
857. #elif defined(LEFT)
858. add tempK, tempOffset, #1
859. #else
860. add tempK, tempOffset, #8
861. #endif
862. asr counterL, tempK, #3 // counterL = tempK / 8
863. cmp counterL , #0
864. ble .Ldtrmm_kernel_L8_M1_40
865. .Ldtrmm_kernel_L8_M1_22:
866. KERNEL1x8_SUB
867. KERNEL1x8_SUB
868. KERNEL1x8_SUB
869. KERNEL1x8_SUB
870. KERNEL1x8_SUB
871. KERNEL1x8_SUB
872. KERNEL1x8_SUB
873. KERNEL1x8_SUB
874. subs counterL, counterL, #1
875. bgt .Ldtrmm_kernel_L8_M1_22
876. .Ldtrmm_kernel_L8_M1_40:
877. ands counterL, tempK, #7 // counterL = tempK % 8
878. ble .Ldtrmm_kernel_L8_M1_100
879. .Ldtrmm_kernel_L8_M1_42:
880. KERNEL1x8_SUB
881. subs counterL, counterL, #1
882. bgt .Ldtrmm_kernel_L8_M1_42
883. .Ldtrmm_kernel_L8_M1_100:
884. SAVE1x8
// TRMM fixup for the 1x8 tile.
885. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
886. sub tempK, origK, tempOffset
887. #if defined(LEFT)
888. sub tempK, tempK, #1
889. #else
890. sub tempK, tempK, #8
891. #endif
892. lsl temp, tempK, #3
893. add pA, pA, temp
894. lsl temp, tempK, #6
895. add pB, pB, temp
896. #endif
897. #if defined(LEFT)
898. add tempOffset, tempOffset, #1
899. #endif
// End of one N=8 column panel: advance B and the RIGHT-side offset.
900. .Ldtrmm_kernel_L8_END:
901. lsl temp, origK, #6
902. add origPB, origPB, temp // B = B + K * 8 * 8
903. #if !defined(LEFT)
904. add tempOffset, tempOffset, #8
905. #endif
906. subs counterJ, counterJ , #1 // j--
907. bgt .Ldtrmm_kernel_L8_BEGIN
  908. /******************************************************************************/
// ---- N-remainder: 4 columns (entered when N % 8 >= 4) ----
909. .Ldtrmm_kernel_L4_BEGIN:
910. mov counterJ , origN
911. tst counterJ , #7
912. ble .Ldtrmm_kernel_L999 // no N remainder at all: done with panels
913. tst counterJ , #4
914. ble .Ldtrmm_kernel_L2_BEGIN
915. mov pCRow0, pC // pCRow0 = C
916. add pC, pC, LDC, lsl #2 // pC += 4 columns
917. #if defined(LEFT)
918. mov tempOffset, offset // restart diagonal offset for a new column panel
919. #endif
920. mov pA, origPA // pA = start of A array
921. .Ldtrmm_kernel_L4_M4_BEGIN:
922. mov counterI, origM
923. asr counterI, counterI, #2 // counterI = counterI / 4
924. cmp counterI, #0
925. ble .Ldtrmm_kernel_L4_M2_BEGIN
926. .Ldtrmm_kernel_L4_M4_20:
// 4x4 tile: both panels advance tempOffset*4*8 bytes (one shift reused).
927. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
928. mov pB, origPB
929. #else
930. mov pB, origPB
931. lsl temp, tempOffset, #5
932. add pB, pB, temp
933. add pA, pA, temp
934. #endif
// Effective K; tile is square (4x4), so both LEFT and RIGHT add 4.
935. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
936. sub tempK, origK, tempOffset
937. #elif defined(LEFT)
938. add tempK, tempOffset, #4
939. #else
940. add tempK, tempOffset, #4
941. #endif
// Software-pipelined K loop, same structure as the 4x8 path.
942. asr counterL, tempK, #1 // L = K / 2
943. cmp counterL , #2 // is there at least 4 to do?
944. blt .Ldtrmm_kernel_L4_M4_32
945. KERNEL4x4_I // do one in the K
946. KERNEL4x4_M2 // do another in the K
947. subs counterL, counterL, #2
948. ble .Ldtrmm_kernel_L4_M4_22a
949. .align 5
950. .Ldtrmm_kernel_L4_M4_22:
951. KERNEL4x4_M1
952. KERNEL4x4_M2
953. subs counterL, counterL, #1
954. bgt .Ldtrmm_kernel_L4_M4_22
955. .Ldtrmm_kernel_L4_M4_22a:
956. KERNEL4x4_M1
957. KERNEL4x4_E
958. b .Ldtrmm_kernel_L4_M4_44
959. .Ldtrmm_kernel_L4_M4_32:
960. tst counterL, #1
961. ble .Ldtrmm_kernel_L4_M4_40
962. KERNEL4x4_I
963. KERNEL4x4_E
964. b .Ldtrmm_kernel_L4_M4_44
965. .Ldtrmm_kernel_L4_M4_40:
966. INIT4x4
967. .Ldtrmm_kernel_L4_M4_44:
// Odd K iteration, then store the 4x4 result.
968. ands counterL , tempK, #1
969. ble .Ldtrmm_kernel_L4_M4_100
970. .Ldtrmm_kernel_L4_M4_46:
971. KERNEL4x4_SUB
972. .Ldtrmm_kernel_L4_M4_100:
973. SAVE4x4
// TRMM fixup: pA and pB both advance tempK*4*8 bytes.
974. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
975. sub tempK, origK, tempOffset
976. #if defined(LEFT)
977. sub tempK, tempK, #4
978. #else
979. sub tempK, tempK, #4
980. #endif
981. lsl temp, tempK, #5
982. add pA, pA, temp
983. add pB, pB, temp
984. #endif
985. #if defined(LEFT)
986. add tempOffset, tempOffset, #4
987. #endif
988. .Ldtrmm_kernel_L4_M4_END:
989. subs counterI, counterI, #1
990. bne .Ldtrmm_kernel_L4_M4_20
// ---- M-remainder tiles for the N=4 block: 2x4 then 1x4 ----
991. .Ldtrmm_kernel_L4_M2_BEGIN:
992. mov counterI, origM
993. tst counterI , #3
994. ble .Ldtrmm_kernel_L4_END
995. tst counterI, #2 // test M & 2: is there a 2-row tile to do?
996. ble .Ldtrmm_kernel_L4_M1_BEGIN
997. .Ldtrmm_kernel_L4_M2_20:
998. INIT2x4
// pA += tempOffset*2*8 bytes (lsl #4), pB += tempOffset*4*8 bytes (lsl #5).
999. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1000. mov pB, origPB
1001. #else
1002. mov pB, origPB
1003. lsl temp, tempOffset, #4
1004. add pA, pA, temp
1005. lsl temp, tempOffset, #5
1006. add pB, pB, temp
1007. #endif
1008. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1009. sub tempK, origK, tempOffset
1010. #elif defined(LEFT)
1011. add tempK, tempOffset, #2
1012. #else
1013. add tempK, tempOffset, #4
1014. #endif
1015. asr counterL , tempK, #3 // counterL = tempK / 8
1016. cmp counterL , #0
1017. ble .Ldtrmm_kernel_L4_M2_40
1018. .Ldtrmm_kernel_L4_M2_22:
1019. KERNEL2x4_SUB
1020. KERNEL2x4_SUB
1021. KERNEL2x4_SUB
1022. KERNEL2x4_SUB
1023. KERNEL2x4_SUB
1024. KERNEL2x4_SUB
1025. KERNEL2x4_SUB
1026. KERNEL2x4_SUB
1027. subs counterL, counterL, #1
1028. bgt .Ldtrmm_kernel_L4_M2_22
1029. .Ldtrmm_kernel_L4_M2_40:
1030. ands counterL , tempK, #7 // counterL = tempK % 8
1031. ble .Ldtrmm_kernel_L4_M2_100
1032. .Ldtrmm_kernel_L4_M2_42:
1033. KERNEL2x4_SUB
1034. subs counterL, counterL, #1
1035. bgt .Ldtrmm_kernel_L4_M2_42
1036. .Ldtrmm_kernel_L4_M2_100:
1037. SAVE2x4
// TRMM fixup for the 2x4 tile.
1038. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1039. sub tempK, origK, tempOffset
1040. #if defined(LEFT)
1041. sub tempK, tempK, #2
1042. #else
1043. sub tempK, tempK, #4
1044. #endif
1045. lsl temp, tempK, #4
1046. add pA, pA, temp
1047. lsl temp, tempK, #5
1048. add pB, pB, temp
1049. #endif
1050. #if defined(LEFT)
1051. add tempOffset, tempOffset, #2
1052. #endif
1053. .Ldtrmm_kernel_L4_M2_END:
// ---- 1x4 tile: last row if M is odd ----
1054. .Ldtrmm_kernel_L4_M1_BEGIN:
1055. tst counterI, #1 // test M & 1: is there a final single row?
1056. ble .Ldtrmm_kernel_L4_END
1057. .Ldtrmm_kernel_L4_M1_20:
1058. INIT1x4
// pB += tempOffset*4*8 bytes (lsl #5), pA += tempOffset*1*8 bytes (lsl #3).
1059. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1060. mov pB, origPB
1061. #else
1062. mov pB, origPB
1063. lsl temp, tempOffset, #5
1064. add pB, pB, temp
1065. lsl temp, tempOffset, #3
1066. add pA, pA, temp
1067. #endif
1068. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1069. sub tempK, origK, tempOffset
1070. #elif defined(LEFT)
1071. add tempK, tempOffset, #1
1072. #else
1073. add tempK, tempOffset, #4
1074. #endif
1075. asr counterL , tempK, #3 // counterL = tempK / 8
1076. cmp counterL , #0
1077. ble .Ldtrmm_kernel_L4_M1_40
1078. .Ldtrmm_kernel_L4_M1_22:
1079. KERNEL1x4_SUB
1080. KERNEL1x4_SUB
1081. KERNEL1x4_SUB
1082. KERNEL1x4_SUB
1083. KERNEL1x4_SUB
1084. KERNEL1x4_SUB
1085. KERNEL1x4_SUB
1086. KERNEL1x4_SUB
1087. subs counterL, counterL, #1
1088. bgt .Ldtrmm_kernel_L4_M1_22
1089. .Ldtrmm_kernel_L4_M1_40:
1090. ands counterL , tempK, #7 // counterL = tempK % 8
1091. ble .Ldtrmm_kernel_L4_M1_100
1092. .Ldtrmm_kernel_L4_M1_42:
1093. KERNEL1x4_SUB
1094. subs counterL, counterL, #1
1095. bgt .Ldtrmm_kernel_L4_M1_42
1096. .Ldtrmm_kernel_L4_M1_100:
1097. SAVE1x4
// TRMM fixup for the 1x4 tile.
1098. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1099. sub tempK, origK, tempOffset
1100. #if defined(LEFT)
1101. sub tempK, tempK, #1
1102. #else
1103. sub tempK, tempK, #4
1104. #endif
1105. lsl temp, tempK, #3
1106. add pA, pA, temp
1107. lsl temp, tempK, #5
1108. add pB, pB, temp
1109. #endif
1110. #if defined(LEFT)
1111. add tempOffset, tempOffset, #1
1112. #endif
// End of the 4-column panel: advance B and the RIGHT-side offset.
1113. .Ldtrmm_kernel_L4_END:
1114. lsl temp, origK, #5
1115. add origPB, origPB, temp // B = B + K * 4 * 8
1116. #if !defined(LEFT)
1117. add tempOffset, tempOffset, #4
1118. #endif
  1119. /******************************************************************************/
// ---- N-remainder: 2 columns (entered when N % 4 >= 2) ----
1120. .Ldtrmm_kernel_L2_BEGIN: // handle a 2-column remainder panel of N
1121. mov counterJ , origN
1122. tst counterJ , #3
1123. ble .Ldtrmm_kernel_L999 // N % 4 == 0: no 2- or 1-column remainder
1124. tst counterJ , #2
1125. ble .Ldtrmm_kernel_L1_BEGIN
1126. mov pCRow0, pC // pCRow0 = pC
1127. add pC,pC,LDC, lsl #1 // pC += 2 columns
1128. #if defined(LEFT)
1129. mov tempOffset, offset // restart diagonal offset for a new column panel
1130. #endif
1131. mov pA, origPA // pA = A
1132. .Ldtrmm_kernel_L2_M4_BEGIN:
1133. mov counterI, origM
1134. asr counterI, counterI, #2 // counterI = counterI / 4
1135. cmp counterI,#0
1136. ble .Ldtrmm_kernel_L2_M2_BEGIN
1137. .Ldtrmm_kernel_L2_M4_20:
1138. INIT4x2
// pB += tempOffset*2*8 bytes (lsl #4), pA += tempOffset*4*8 bytes (lsl #5).
1139. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1140. mov pB, origPB
1141. #else
1142. mov pB, origPB
1143. lsl temp, tempOffset, #4
1144. add pB, pB, temp
1145. lsl temp, tempOffset, #5
1146. add pA, pA, temp
1147. #endif
1148. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1149. sub tempK, origK, tempOffset
1150. #elif defined(LEFT)
1151. add tempK, tempOffset, #4
1152. #else
1153. add tempK, tempOffset, #2
1154. #endif
1155. asr counterL , tempK, #3 // counterL = tempK / 8
1156. cmp counterL,#0
1157. ble .Ldtrmm_kernel_L2_M4_40
1158. .align 5
1159. .Ldtrmm_kernel_L2_M4_22:
1160. KERNEL4x2_SUB
1161. KERNEL4x2_SUB
1162. KERNEL4x2_SUB
1163. KERNEL4x2_SUB
1164. KERNEL4x2_SUB
1165. KERNEL4x2_SUB
1166. KERNEL4x2_SUB
1167. KERNEL4x2_SUB
1168. subs counterL, counterL, #1
1169. bgt .Ldtrmm_kernel_L2_M4_22
1170. .Ldtrmm_kernel_L2_M4_40:
1171. ands counterL , tempK, #7 // counterL = tempK % 8
1172. ble .Ldtrmm_kernel_L2_M4_100
1173. .Ldtrmm_kernel_L2_M4_42:
1174. KERNEL4x2_SUB
1175. subs counterL, counterL, #1
1176. bgt .Ldtrmm_kernel_L2_M4_42
1177. .Ldtrmm_kernel_L2_M4_100:
1178. SAVE4x2
// TRMM fixup for the 4x2 tile.
1179. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1180. sub tempK, origK, tempOffset
1181. #if defined(LEFT)
1182. sub tempK, tempK, #4
1183. #else
1184. sub tempK, tempK, #2
1185. #endif
1186. lsl temp, tempK, #5
1187. add pA, pA, temp
1188. lsl temp, tempK, #4
1189. add pB, pB, temp
1190. #endif
1191. #if defined(LEFT)
1192. add tempOffset, tempOffset, #4
1193. #endif
1194. .Ldtrmm_kernel_L2_M4_END:
1195. subs counterI, counterI, #1
1196. bgt .Ldtrmm_kernel_L2_M4_20
// ---- 2x2 tile ----
1197. .Ldtrmm_kernel_L2_M2_BEGIN:
1198. mov counterI, origM
1199. tst counterI , #3
1200. ble .Ldtrmm_kernel_L2_END
1201. tst counterI, #2 // test M & 2: is there a 2-row tile to do?
1202. ble .Ldtrmm_kernel_L2_M1_BEGIN
1203. .Ldtrmm_kernel_L2_M2_20:
1204. INIT2x2
// Both panels are 2 wide: pA and pB advance tempOffset*2*8 bytes each.
1205. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1206. mov pB, origPB
1207. #else
1208. mov pB, origPB
1209. lsl temp, tempOffset, #4
1210. add pB, pB, temp
1211. lsl temp, tempOffset, #4
1212. add pA, pA, temp
1213. #endif
1214. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1215. sub tempK, origK, tempOffset
1216. #elif defined(LEFT)
1217. add tempK, tempOffset, #2
1218. #else
1219. add tempK, tempOffset, #2
1220. #endif
1221. asr counterL , tempK, #3 // counterL = tempK / 8
1222. cmp counterL,#0
1223. ble .Ldtrmm_kernel_L2_M2_40
1224. .Ldtrmm_kernel_L2_M2_22:
1225. KERNEL2x2_SUB
1226. KERNEL2x2_SUB
1227. KERNEL2x2_SUB
1228. KERNEL2x2_SUB
1229. KERNEL2x2_SUB
1230. KERNEL2x2_SUB
1231. KERNEL2x2_SUB
1232. KERNEL2x2_SUB
1233. subs counterL, counterL, #1
1234. bgt .Ldtrmm_kernel_L2_M2_22
1235. .Ldtrmm_kernel_L2_M2_40:
1236. ands counterL , tempK, #7 // counterL = tempK % 8
1237. ble .Ldtrmm_kernel_L2_M2_100
1238. .Ldtrmm_kernel_L2_M2_42:
1239. KERNEL2x2_SUB
1240. subs counterL, counterL, #1
1241. bgt .Ldtrmm_kernel_L2_M2_42
1242. .Ldtrmm_kernel_L2_M2_100:
1243. SAVE2x2
// TRMM fixup for the 2x2 tile.
1244. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1245. sub tempK, origK, tempOffset
1246. #if defined(LEFT)
1247. sub tempK, tempK, #2
1248. #else
1249. sub tempK, tempK, #2
1250. #endif
1251. lsl temp, tempK, #4
1252. add pA, pA, temp
1253. lsl temp, tempK, #4
1254. add pB, pB, temp
1255. #endif
1256. #if defined(LEFT)
1257. add tempOffset, tempOffset, #2
1258. #endif
1259. .Ldtrmm_kernel_L2_M2_END:
// ---- 1x2 tile: last row if M is odd ----
1260. .Ldtrmm_kernel_L2_M1_BEGIN:
1261. tst counterI, #1 // test M & 1: is there a final single row?
1262. ble .Ldtrmm_kernel_L2_END
1263. .Ldtrmm_kernel_L2_M1_20:
1264. INIT1x2
// pB += tempOffset*2*8 bytes (lsl #4), pA += tempOffset*1*8 bytes (lsl #3).
1265. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1266. mov pB, origPB
1267. #else
1268. mov pB, origPB
1269. lsl temp, tempOffset, #4
1270. add pB, pB, temp
1271. lsl temp, tempOffset, #3
1272. add pA, pA, temp
1273. #endif
1274. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1275. sub tempK, origK, tempOffset
1276. #elif defined(LEFT)
1277. add tempK, tempOffset, #1
1278. #else
1279. add tempK, tempOffset, #2
1280. #endif
1281. asr counterL , tempK, #3 // counterL = tempK / 8
1282. cmp counterL, #0
1283. ble .Ldtrmm_kernel_L2_M1_40
1284. .Ldtrmm_kernel_L2_M1_22:
1285. KERNEL1x2_SUB
1286. KERNEL1x2_SUB
1287. KERNEL1x2_SUB
1288. KERNEL1x2_SUB
1289. KERNEL1x2_SUB
1290. KERNEL1x2_SUB
1291. KERNEL1x2_SUB
1292. KERNEL1x2_SUB
1293. subs counterL, counterL, #1
1294. bgt .Ldtrmm_kernel_L2_M1_22
1295. .Ldtrmm_kernel_L2_M1_40:
1296. ands counterL , tempK, #7 // counterL = tempK % 8
1297. ble .Ldtrmm_kernel_L2_M1_100
1298. .Ldtrmm_kernel_L2_M1_42:
1299. KERNEL1x2_SUB
1300. subs counterL, counterL, #1
1301. bgt .Ldtrmm_kernel_L2_M1_42
1302. .Ldtrmm_kernel_L2_M1_100:
1303. SAVE1x2
// TRMM fixup for the 1x2 tile.
1304. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1305. sub tempK, origK, tempOffset
1306. #if defined(LEFT)
1307. sub tempK, tempK, #1
1308. #else
1309. sub tempK, tempK, #2
1310. #endif
1311. lsl temp, tempK, #3
1312. add pA, pA, temp
1313. lsl temp, tempK, #4
1314. add pB, pB, temp
1315. #endif
1316. #if defined(LEFT)
1317. add tempOffset, tempOffset, #1
1318. #endif
// End of the 2-column panel: advance the RIGHT-side offset and B.
1319. .Ldtrmm_kernel_L2_END:
1320. #if !defined(LEFT)
1321. add tempOffset, tempOffset, #2
1322. #endif
1323. add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
  1324. /******************************************************************************/
// ---- N-remainder: final single column (N odd) ----
1325. .Ldtrmm_kernel_L1_BEGIN:
1326. mov counterJ , origN
1327. tst counterJ , #1
1328. ble .Ldtrmm_kernel_L999 // done
1329. mov pCRow0, pC // pCRow0 = C
1330. add pC , pC , LDC // Update pC to point to next
1331. #if defined(LEFT)
1332. mov tempOffset, offset // restart diagonal offset for the last column
1333. #endif
1334. mov pA, origPA // pA = A
1335. .Ldtrmm_kernel_L1_M4_BEGIN:
1336. mov counterI, origM
1337. asr counterI, counterI, #2 // counterI = counterI / 4
1338. cmp counterI, #0
1339. ble .Ldtrmm_kernel_L1_M2_BEGIN
1340. .Ldtrmm_kernel_L1_M4_20:
1341. INIT4x1
// pB += tempOffset*1*8 bytes (lsl #3), pA += tempOffset*4*8 bytes (lsl #5).
1342. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1343. mov pB, origPB
1344. #else
1345. mov pB, origPB
1346. lsl temp, tempOffset, #3
1347. add pB, pB, temp
1348. lsl temp, tempOffset, #5
1349. add pA, pA, temp
1350. #endif
1351. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1352. sub tempK, origK, tempOffset
1353. #elif defined(LEFT)
1354. add tempK, tempOffset, #4
1355. #else
1356. add tempK, tempOffset, #1
1357. #endif
1358. asr counterL , tempK, #3 // counterL = tempK / 8
1359. cmp counterL , #0
1360. ble .Ldtrmm_kernel_L1_M4_40
1361. .align 5
1362. .Ldtrmm_kernel_L1_M4_22:
1363. KERNEL4x1_SUB
1364. KERNEL4x1_SUB
1365. KERNEL4x1_SUB
1366. KERNEL4x1_SUB
1367. KERNEL4x1_SUB
1368. KERNEL4x1_SUB
1369. KERNEL4x1_SUB
1370. KERNEL4x1_SUB
1371. subs counterL, counterL, #1
1372. bgt .Ldtrmm_kernel_L1_M4_22
1373. .Ldtrmm_kernel_L1_M4_40:
1374. ands counterL , tempK, #7 // counterL = tempK % 8
1375. ble .Ldtrmm_kernel_L1_M4_100
1376. .Ldtrmm_kernel_L1_M4_42:
1377. KERNEL4x1_SUB
1378. subs counterL, counterL, #1
1379. bgt .Ldtrmm_kernel_L1_M4_42
1380. .Ldtrmm_kernel_L1_M4_100:
1381. SAVE4x1
// TRMM fixup for the 4x1 tile.
1382. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1383. sub tempK, origK, tempOffset
1384. #if defined(LEFT)
1385. sub tempK, tempK, #4
1386. #else
1387. sub tempK, tempK, #1
1388. #endif
1389. lsl temp, tempK, #5
1390. add pA, pA, temp
1391. lsl temp, tempK, #3
1392. add pB, pB, temp
1393. #endif
1394. #if defined(LEFT)
1395. add tempOffset, tempOffset, #4
1396. #endif
1397. .Ldtrmm_kernel_L1_M4_END:
1398. subs counterI, counterI, #1
1399. bgt .Ldtrmm_kernel_L1_M4_20
// ---- 2x1 tile ----
1400. .Ldtrmm_kernel_L1_M2_BEGIN:
1401. mov counterI, origM
1402. tst counterI , #3
1403. ble .Ldtrmm_kernel_L1_END
1404. tst counterI, #2 // test M & 2: is there a 2-row tile to do?
1405. ble .Ldtrmm_kernel_L1_M1_BEGIN
1406. .Ldtrmm_kernel_L1_M2_20:
1407. INIT2x1
// pB += tempOffset*1*8 bytes (lsl #3), pA += tempOffset*2*8 bytes (lsl #4).
1408. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1409. mov pB, origPB
1410. #else
1411. mov pB, origPB
1412. lsl temp, tempOffset, #3
1413. add pB, pB, temp
1414. lsl temp, tempOffset, #4
1415. add pA, pA, temp
1416. #endif
1417. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1418. sub tempK, origK, tempOffset
1419. #elif defined(LEFT)
1420. add tempK, tempOffset, #2
1421. #else
1422. add tempK, tempOffset, #1
1423. #endif
1424. asr counterL , tempK, #3 // counterL = tempK / 8
1425. cmp counterL , #0
1426. ble .Ldtrmm_kernel_L1_M2_40
1427. .Ldtrmm_kernel_L1_M2_22:
1428. KERNEL2x1_SUB
1429. KERNEL2x1_SUB
1430. KERNEL2x1_SUB
1431. KERNEL2x1_SUB
1432. KERNEL2x1_SUB
1433. KERNEL2x1_SUB
1434. KERNEL2x1_SUB
1435. KERNEL2x1_SUB
1436. subs counterL, counterL, #1
1437. bgt .Ldtrmm_kernel_L1_M2_22
1438. .Ldtrmm_kernel_L1_M2_40:
1439. ands counterL , tempK, #7 // counterL = tempK % 8
1440. ble .Ldtrmm_kernel_L1_M2_100
1441. .Ldtrmm_kernel_L1_M2_42:
1442. KERNEL2x1_SUB
1443. subs counterL, counterL, #1
1444. bgt .Ldtrmm_kernel_L1_M2_42
1445. .Ldtrmm_kernel_L1_M2_100:
1446. SAVE2x1
// TRMM fixup for the 2x1 tile.
1447. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1448. sub tempK, origK, tempOffset
1449. #if defined(LEFT)
1450. sub tempK, tempK, #2
1451. #else
1452. sub tempK, tempK, #1
1453. #endif
1454. lsl temp, tempK, #4
1455. add pA, pA, temp
1456. lsl temp, tempK, #3
1457. add pB, pB, temp
1458. #endif
1459. #if defined(LEFT)
1460. add tempOffset, tempOffset, #2
1461. #endif
1462. .Ldtrmm_kernel_L1_M2_END:
// ---- 1x1 tile: final scalar element ----
1463. .Ldtrmm_kernel_L1_M1_BEGIN:
1464. tst counterI, #1 // test M & 1: is there a final single row?
1465. ble .Ldtrmm_kernel_L1_END
1466. .Ldtrmm_kernel_L1_M1_20:
1467. INIT1x1
// Both panels are 1 wide: pA and pB advance tempOffset*1*8 bytes each.
1468. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1469. mov pB, origPB
1470. #else
1471. mov pB, origPB
1472. lsl temp, tempOffset, #3
1473. add pB, pB, temp
1474. lsl temp, tempOffset, #3
1475. add pA, pA, temp
1476. #endif
1477. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1478. sub tempK, origK, tempOffset
1479. #elif defined(LEFT)
1480. add tempK, tempOffset, #1
1481. #else
1482. add tempK, tempOffset, #1
1483. #endif
1484. asr counterL , tempK, #3 // counterL = tempK / 8
1485. cmp counterL , #0
1486. ble .Ldtrmm_kernel_L1_M1_40
1487. .Ldtrmm_kernel_L1_M1_22:
1488. KERNEL1x1_SUB
1489. KERNEL1x1_SUB
1490. KERNEL1x1_SUB
1491. KERNEL1x1_SUB
1492. KERNEL1x1_SUB
1493. KERNEL1x1_SUB
1494. KERNEL1x1_SUB
1495. KERNEL1x1_SUB
1496. subs counterL, counterL, #1
1497. bgt .Ldtrmm_kernel_L1_M1_22
1498. .Ldtrmm_kernel_L1_M1_40:
1499. ands counterL , tempK, #7 // counterL = tempK % 8
1500. ble .Ldtrmm_kernel_L1_M1_100
1501. .Ldtrmm_kernel_L1_M1_42:
1502. KERNEL1x1_SUB
1503. subs counterL, counterL, #1
1504. bgt .Ldtrmm_kernel_L1_M1_42
1505. .Ldtrmm_kernel_L1_M1_100:
1506. SAVE1x1
// No TRMM pointer fixup needed: this is the last tile of the last panel.
1507. .Ldtrmm_kernel_L1_END:
// ---- Epilogue: restore saved registers and return 0 ----
// Layout and register set must mirror the prologue (above this view):
// 11 16-byte slots. NOTE(review): d16/d17 are volatile under AAPCS64 and
// x18 is the platform register — saving/restoring them here is presumably
// just matching the prologue; confirm against the function entry code.
1508. .Ldtrmm_kernel_L999:
1509. mov x0, #0 // set return value
1510. ldp d8, d9, [sp, #(0 * 16)]
1511. ldp d10, d11, [sp, #(1 * 16)]
1512. ldp d12, d13, [sp, #(2 * 16)]
1513. ldp d14, d15, [sp, #(3 * 16)]
1514. ldp d16, d17, [sp, #(4 * 16)]
1515. ldp x18, x19, [sp, #(5 * 16)]
1516. ldp x20, x21, [sp, #(6 * 16)]
1517. ldp x22, x23, [sp, #(7 * 16)]
1518. ldp x24, x25, [sp, #(8 * 16)]
1519. ldp x26, x27, [sp, #(9 * 16)]
1520. ldr x28, [sp, #(10 * 16)]
1521. add sp, sp, #(11*16)
1522. ret
1523. EPILOGUE