You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they may include dashes ('-') and can be up to 35 characters long.

sgemm_kernel_8x8.S 43 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305
  1. /*******************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. /* X0 X1 X2 s0 X3 x4 x5 x6 */
  30. /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) */
  31. #define origM x0
  32. #define origN x1
  33. #define origK x2
  34. #define origPA x3
  35. #define origPB x4
  36. #define pC x5
  37. #define LDC x6
  38. #define offset x7
  39. #define counterL x8
  40. #define counterI x9
  41. #define counterJ x10
  42. #define pB x11
  43. #define pCRow0 x12
  44. #define pCRow1 x13
  45. #define pCRow2 x14
  46. #define pA x15
  47. #define temp x16
  48. #define alpha0 s10
  49. #define alphaV0 v10.s[0]
  50. #define alpha1 s11
  51. #define alphaV1 v11.s[0]
  52. #define alpha2 s14
  53. #define alphaV2 v14.s[0]
  54. #define alpha3 s15
  55. #define alphaV3 v15.s[0]
  56. // 00 origM
  57. // 01 origN
  58. // 02 origK
  59. // 03 origPA
  60. // 04 origPB
  61. // 05 pC
  62. // 06 origLDC -> LDC
  63. // 07 offset
  64. // 08 counterL
  65. // 09 counterI
  66. // 10 counterJ
  67. // 11 pB
  68. // 12 pCRow0
  69. // 13 pCRow1
  70. // 14 pCRow2
  71. // 15 pA
  72. // 16 temp
  73. // 17
  74. // 18 must save
  75. // 19 must save
  76. // 20 must save
  77. // 21 must save
  78. // 22 must save
  79. // 23 must save
  80. // 24 must save
  81. // 25 must save
  82. // 26 must save
  83. // 27 must save
  84. // 28 must save
  85. // 29 frame
  86. // 30 link
  87. // 31 sp
  88. //v00 ALPHA -> pA0_0, pA0_1, pA0_2, pA0_3
  89. //v01 pA0_4, pA0_5, pA0_6, pA0_7
  90. //v02 pA1_0, pA1_1, pA1_2, pA1_3
  91. //v03 pA1_4, pA1_5, pA1_6, pA1_7
  92. //v04 pB0_0, pB0_1, pB0_2, pB0_3
  93. //v05 pB0_4, pB0_5, pB0_6, pB0_7
  94. //v06 pB1_0, pB1_1, pB1_2, pB1_3
  95. //v07 pB1_4, pB1_5, pB1_6, pB1_7
  96. //v08 must save
  97. //v09 must save
  98. //v10 must save ALPHA0
  99. //v11 must save ALPHA1
  100. //v12 must save
  101. //v13 must save
  102. //v14 must save ALPHA2
  103. //v15 must save ALPHA3
  104. //v16 must save C00, C01, C02, C03
  105. //v17 must save C04, C05, C06, C07
  106. //v18 C08, C09, C10, C11
  107. //v19 C12, C13, C14, C15
  108. //v20 C16, C17, C18, C19
  109. //v21 C20, C21, C22, C23
  110. //v22 C24, C25, C26, C27
  111. //v23 C28, C29, C30, C31
  112. //v24 C32, C33, C34, C35
  113. //v25 C36, C37, C38, C39
  114. //v26 C40, C41, C42, C43
  115. //v27 C44, C45, C46, C47
  116. //v28 C48, C49, C50, C51
  117. //v29 C52, C53, C54, C55
  118. //v30 C56, C57, C58, C59
  119. //v31 C60, C61, C62, C63
  120. /*******************************************************************************
  121. * Macro definitions
  122. *******************************************************************************/
  123. .macro INIT8x8
  124. fmov s16, wzr
  125. fmov s17, wzr
  126. fmov s18, s16
  127. fmov s19, s17
  128. fmov s20, wzr
  129. fmov s21, s16
  130. fmov s22, s17
  131. fmov s23, s18
  132. fmov s24, wzr
  133. fmov s25, s16
  134. fmov s26, s17
  135. fmov s27, s18
  136. fmov s28, wzr
  137. fmov s29, s16
  138. fmov s30, s17
  139. fmov s31, s18
  140. .endm
  141. .macro KERNEL8x8_I
  142. ld1 {v4.4s}, [pB]
  143. add pB, pB, #16
  144. ld1 {v5.4s}, [pB]
  145. add pB, pB, #16
  146. ld1 {v0.4s}, [pA]
  147. add pA, pA, #16
  148. ld1 {v1.4s}, [pA]
  149. add pA, pA, #16
  150. fmul v16.4s, v0.4s, v4.s[0]
  151. fmul v17.4s, v1.4s, v4.s[0]
  152. fmul v18.4s, v0.4s, v4.s[1]
  153. fmul v19.4s, v1.4s, v4.s[1]
  154. fmul v20.4s, v0.4s, v4.s[2]
  155. fmul v21.4s, v1.4s, v4.s[2]
  156. fmul v22.4s, v0.4s, v4.s[3]
  157. fmul v23.4s, v1.4s, v4.s[3]
  158. fmul v24.4s, v0.4s, v5.s[0]
  159. fmul v25.4s, v1.4s, v5.s[0]
  160. fmul v26.4s, v0.4s, v5.s[1]
  161. fmul v27.4s, v1.4s, v5.s[1]
  162. fmul v28.4s, v0.4s, v5.s[2]
  163. fmul v29.4s, v1.4s, v5.s[2]
  164. fmul v30.4s, v0.4s, v5.s[3]
  165. fmul v31.4s, v1.4s, v5.s[3]
  166. ld1 {v6.4s}, [pB]
  167. add pB, pB, #16
  168. ld1 {v7.4s}, [pB]
  169. add pB, pB, #16
  170. ld1 {v2.4s}, [pA]
  171. add pA, pA, #16
  172. ld1 {v3.4s}, [pA]
  173. add pA, pA, #16
  174. .endm
  175. .macro KERNEL8x8_M1
  176. fmla v16.4s, v0.4s, v4.s[0]
  177. fmla v17.4s, v1.4s, v4.s[0]
  178. fmla v18.4s, v0.4s, v4.s[1]
  179. fmla v19.4s, v1.4s, v4.s[1]
  180. fmla v20.4s, v0.4s, v4.s[2]
  181. fmla v21.4s, v1.4s, v4.s[2]
  182. fmla v22.4s, v0.4s, v4.s[3]
  183. fmla v23.4s, v1.4s, v4.s[3]
  184. fmla v24.4s, v0.4s, v5.s[0]
  185. fmla v25.4s, v1.4s, v5.s[0]
  186. fmla v26.4s, v0.4s, v5.s[1]
  187. fmla v27.4s, v1.4s, v5.s[1]
  188. fmla v28.4s, v0.4s, v5.s[2]
  189. fmla v29.4s, v1.4s, v5.s[2]
  190. fmla v30.4s, v0.4s, v5.s[3]
  191. fmla v31.4s, v1.4s, v5.s[3]
  192. ld1 {v6.4s}, [pB]
  193. add pB, pB, #16
  194. ld1 {v7.4s}, [pB]
  195. add pB, pB, #16
  196. ld1 {v2.4s}, [pA]
  197. add pA, pA, #16
  198. ld1 {v3.4s}, [pA]
  199. add pA, pA, #16
  200. .endm
  201. .macro KERNEL8x8_M2
  202. fmla v16.4s, v2.4s, v6.s[0]
  203. fmla v17.4s, v3.4s, v6.s[0]
  204. fmla v18.4s, v2.4s, v6.s[1]
  205. fmla v19.4s, v3.4s, v6.s[1]
  206. fmla v20.4s, v2.4s, v6.s[2]
  207. fmla v21.4s, v3.4s, v6.s[2]
  208. fmla v22.4s, v2.4s, v6.s[3]
  209. fmla v23.4s, v3.4s, v6.s[3]
  210. fmla v24.4s, v2.4s, v7.s[0]
  211. fmla v25.4s, v3.4s, v7.s[0]
  212. fmla v26.4s, v2.4s, v7.s[1]
  213. fmla v27.4s, v3.4s, v7.s[1]
  214. fmla v28.4s, v2.4s, v7.s[2]
  215. fmla v29.4s, v3.4s, v7.s[2]
  216. fmla v30.4s, v2.4s, v7.s[3]
  217. fmla v31.4s, v3.4s, v7.s[3]
  218. ld1 {v4.4s}, [pB]
  219. add pB, pB, #16
  220. ld1 {v5.4s}, [pB]
  221. add pB, pB, #16
  222. ld1 {v0.4s}, [pA]
  223. add pA, pA, #16
  224. ld1 {v1.4s}, [pA]
  225. add pA, pA, #16
  226. .endm
  227. .macro KERNEL8x8_E
  228. fmla v16.4s, v2.4s, v6.s[0]
  229. fmla v17.4s, v3.4s, v6.s[0]
  230. fmla v18.4s, v2.4s, v6.s[1]
  231. fmla v19.4s, v3.4s, v6.s[1]
  232. fmla v20.4s, v2.4s, v6.s[2]
  233. fmla v21.4s, v3.4s, v6.s[2]
  234. fmla v22.4s, v2.4s, v6.s[3]
  235. fmla v23.4s, v3.4s, v6.s[3]
  236. fmla v24.4s, v2.4s, v7.s[0]
  237. fmla v25.4s, v3.4s, v7.s[0]
  238. fmla v26.4s, v2.4s, v7.s[1]
  239. fmla v27.4s, v3.4s, v7.s[1]
  240. fmla v28.4s, v2.4s, v7.s[2]
  241. fmla v29.4s, v3.4s, v7.s[2]
  242. fmla v30.4s, v2.4s, v7.s[3]
  243. fmla v31.4s, v3.4s, v7.s[3]
  244. .endm
  245. .macro KERNEL8x8_SUB
  246. ld1 {v4.4s}, [pB]
  247. add pB, pB, #16
  248. ld1 {v5.4s}, [pB]
  249. add pB, pB, #16
  250. ld1 {v0.4s}, [pA]
  251. add pA, pA, #16
  252. ld1 {v1.4s}, [pA]
  253. add pA, pA, #16
  254. fmla v16.4s, v0.4s, v4.s[0]
  255. fmla v17.4s, v1.4s, v4.s[0]
  256. fmla v18.4s, v0.4s, v4.s[1]
  257. fmla v19.4s, v1.4s, v4.s[1]
  258. fmla v20.4s, v0.4s, v4.s[2]
  259. fmla v21.4s, v1.4s, v4.s[2]
  260. fmla v22.4s, v0.4s, v4.s[3]
  261. fmla v23.4s, v1.4s, v4.s[3]
  262. fmla v24.4s, v0.4s, v5.s[0]
  263. fmla v25.4s, v1.4s, v5.s[0]
  264. fmla v26.4s, v0.4s, v5.s[1]
  265. fmla v27.4s, v1.4s, v5.s[1]
  266. fmla v28.4s, v0.4s, v5.s[2]
  267. fmla v29.4s, v1.4s, v5.s[2]
  268. fmla v30.4s, v0.4s, v5.s[3]
  269. fmla v31.4s, v1.4s, v5.s[3]
  270. .endm
  271. .macro SAVE8x8
  272. add pCRow1, pCRow0, LDC
  273. ld1 {v0.4s, v1.4s}, [pCRow0]
  274. fmla v0.4s, v16.4s, alphaV0
  275. fmla v1.4s, v17.4s, alphaV1
  276. st1 {v0.4s, v1.4s}, [pCRow0]
  277. add pCRow2, pCRow1, LDC
  278. ld1 {v2.4s, v3.4s}, [pCRow1]
  279. fmla v2.4s, v18.4s, alphaV2
  280. fmla v3.4s, v19.4s, alphaV3
  281. st1 {v2.4s, v3.4s}, [pCRow1]
  282. add pCRow1, pCRow2, LDC
  283. ld1 {v4.4s, v5.4s}, [pCRow2]
  284. fmla v4.4s, v20.4s, alphaV0
  285. fmla v5.4s, v21.4s, alphaV1
  286. st1 {v4.4s, v5.4s}, [pCRow2]
  287. add pCRow2, pCRow1, LDC
  288. ld1 {v6.4s, v7.4s}, [pCRow1]
  289. fmla v6.4s, v22.4s, alphaV2
  290. fmla v7.4s, v23.4s, alphaV3
  291. st1 {v6.4s, v7.4s}, [pCRow1]
  292. add pCRow1, pCRow2, LDC
  293. ld1 {v0.4s, v1.4s}, [pCRow2]
  294. fmla v0.4s, v24.4s, alphaV0
  295. fmla v1.4s, v25.4s, alphaV1
  296. st1 {v0.4s, v1.4s}, [pCRow2]
  297. add pCRow2, pCRow1, LDC
  298. ld1 {v2.4s, v3.4s}, [pCRow1]
  299. fmla v2.4s, v26.4s, alphaV2
  300. fmla v3.4s, v27.4s, alphaV3
  301. st1 {v2.4s, v3.4s}, [pCRow1]
  302. add pCRow1, pCRow2, LDC
  303. ld1 {v4.4s, v5.4s}, [pCRow2]
  304. fmla v4.4s, v28.4s, alphaV0
  305. fmla v5.4s, v29.4s, alphaV1
  306. st1 {v4.4s, v5.4s}, [pCRow2]
  307. ld1 {v6.4s, v7.4s}, [pCRow1]
  308. fmla v6.4s, v30.4s, alphaV2
  309. fmla v7.4s, v31.4s, alphaV3
  310. st1 {v6.4s, v7.4s}, [pCRow1]
  311. add pCRow0, pCRow0, #32
  312. .endm
  313. /******************************************************************************/
  314. .macro INIT4x8
  315. fmov s16, wzr
  316. fmov s18, wzr
  317. fmov s20, wzr
  318. fmov s22, s16
  319. fmov s24, wzr
  320. fmov s26, s16
  321. fmov s28, s18
  322. fmov s30, s20
  323. .endm
  324. .macro KERNEL4x8_I
  325. ld1 {v4.4s}, [pB]
  326. add pB, pB, #16
  327. ld1 {v5.4s}, [pB]
  328. add pB, pB, #16
  329. ld1 {v0.4s}, [pA]
  330. add pA, pA, #16
  331. fmul v16.4s, v0.4s, v4.s[0]
  332. fmul v18.4s, v0.4s, v4.s[1]
  333. fmul v20.4s, v0.4s, v4.s[2]
  334. fmul v22.4s, v0.4s, v4.s[3]
  335. fmul v24.4s, v0.4s, v5.s[0]
  336. fmul v26.4s, v0.4s, v5.s[1]
  337. fmul v28.4s, v0.4s, v5.s[2]
  338. fmul v30.4s, v0.4s, v5.s[3]
  339. ld1 {v6.4s}, [pB]
  340. add pB, pB, #16
  341. ld1 {v7.4s}, [pB]
  342. add pB, pB, #16
  343. ld1 {v2.4s}, [pA]
  344. add pA, pA, #16
  345. .endm
  346. .macro KERNEL4x8_M1
  347. fmla v16.4s, v0.4s, v4.s[0]
  348. fmla v18.4s, v0.4s, v4.s[1]
  349. fmla v20.4s, v0.4s, v4.s[2]
  350. fmla v22.4s, v0.4s, v4.s[3]
  351. fmla v24.4s, v0.4s, v5.s[0]
  352. fmla v26.4s, v0.4s, v5.s[1]
  353. fmla v28.4s, v0.4s, v5.s[2]
  354. fmla v30.4s, v0.4s, v5.s[3]
  355. ld1 {v6.4s}, [pB]
  356. add pB, pB, #16
  357. ld1 {v7.4s}, [pB]
  358. add pB, pB, #16
  359. ld1 {v2.4s}, [pA]
  360. add pA, pA, #16
  361. .endm
  362. .macro KERNEL4x8_M2
  363. fmla v16.4s, v2.4s, v6.s[0]
  364. fmla v18.4s, v2.4s, v6.s[1]
  365. fmla v20.4s, v2.4s, v6.s[2]
  366. fmla v22.4s, v2.4s, v6.s[3]
  367. fmla v24.4s, v2.4s, v7.s[0]
  368. fmla v26.4s, v2.4s, v7.s[1]
  369. fmla v28.4s, v2.4s, v7.s[2]
  370. fmla v30.4s, v2.4s, v7.s[3]
  371. ld1 {v4.4s}, [pB]
  372. add pB, pB, #16
  373. ld1 {v5.4s}, [pB]
  374. add pB, pB, #16
  375. ld1 {v0.4s}, [pA]
  376. add pA, pA, #16
  377. .endm
  378. .macro KERNEL4x8_E
  379. fmla v16.4s, v2.4s, v6.s[0]
  380. fmla v18.4s, v2.4s, v6.s[1]
  381. fmla v20.4s, v2.4s, v6.s[2]
  382. fmla v22.4s, v2.4s, v6.s[3]
  383. fmla v24.4s, v2.4s, v7.s[0]
  384. fmla v26.4s, v2.4s, v7.s[1]
  385. fmla v28.4s, v2.4s, v7.s[2]
  386. fmla v30.4s, v2.4s, v7.s[3]
  387. .endm
  388. .macro KERNEL4x8_SUB
  389. ld1 {v4.4s}, [pB]
  390. add pB, pB, #16
  391. ld1 {v5.4s}, [pB]
  392. add pB, pB, #16
  393. ld1 {v0.4s}, [pA]
  394. add pA, pA, #16
  395. fmla v16.4s, v0.4s, v4.s[0]
  396. fmla v18.4s, v0.4s, v4.s[1]
  397. fmla v20.4s, v0.4s, v4.s[2]
  398. fmla v22.4s, v0.4s, v4.s[3]
  399. fmla v24.4s, v0.4s, v5.s[0]
  400. fmla v26.4s, v0.4s, v5.s[1]
  401. fmla v28.4s, v0.4s, v5.s[2]
  402. fmla v30.4s, v0.4s, v5.s[3]
  403. .endm
  404. .macro SAVE4x8
  405. add pCRow1, pCRow0, LDC
  406. ld1 {v0.4s}, [pCRow0]
  407. fmla v0.4s, v16.4s, alphaV0
  408. st1 {v0.4s}, [pCRow0]
  409. add pCRow2, pCRow1, LDC
  410. ld1 {v2.4s}, [pCRow1]
  411. fmla v2.4s, v18.4s, alphaV2
  412. st1 {v2.4s}, [pCRow1]
  413. add pCRow1, pCRow2, LDC
  414. ld1 {v4.4s}, [pCRow2]
  415. fmla v4.4s, v20.4s, alphaV0
  416. st1 {v4.4s}, [pCRow2]
  417. add pCRow2, pCRow1, LDC
  418. ld1 {v6.4s}, [pCRow1]
  419. fmla v6.4s, v22.4s, alphaV2
  420. st1 {v6.4s}, [pCRow1]
  421. add pCRow1, pCRow2, LDC
  422. ld1 {v0.4s}, [pCRow2]
  423. fmla v0.4s, v24.4s, alphaV0
  424. st1 {v0.4s}, [pCRow2]
  425. add pCRow2, pCRow1, LDC
  426. ld1 {v2.4s}, [pCRow1]
  427. fmla v2.4s, v26.4s, alphaV2
  428. st1 {v2.4s}, [pCRow1]
  429. add pCRow1, pCRow2, LDC
  430. ld1 {v4.4s}, [pCRow2]
  431. fmla v4.4s, v28.4s, alphaV0
  432. st1 {v4.4s}, [pCRow2]
  433. ld1 {v6.4s}, [pCRow1]
  434. fmla v6.4s, v30.4s, alphaV2
  435. st1 {v6.4s}, [pCRow1]
  436. add pCRow0, pCRow0, #16
  437. .endm
  438. /******************************************************************************/
  439. .macro INIT2x8
  440. fmov s16, wzr
  441. fmov s18, wzr
  442. fmov s20, wzr
  443. fmov s22, s16
  444. fmov s24, wzr
  445. fmov s26, s16
  446. fmov s28, s18
  447. fmov s30, s20
  448. .endm
  449. .macro KERNEL2x8_SUB
  450. ld1 {v4.4s}, [pB]
  451. add pB, pB, #16
  452. ld1 {v5.4s}, [pB]
  453. add pB, pB, #16
  454. ld1 {v0.2s}, [pA]
  455. add pA, pA, #8
  456. fmla v16.2s, v0.2s, v4.s[0]
  457. fmla v18.2s, v0.2s, v4.s[1]
  458. fmla v20.2s, v0.2s, v4.s[2]
  459. fmla v22.2s, v0.2s, v4.s[3]
  460. fmla v24.2s, v0.2s, v5.s[0]
  461. fmla v26.2s, v0.2s, v5.s[1]
  462. fmla v28.2s, v0.2s, v5.s[2]
  463. fmla v30.2s, v0.2s, v5.s[3]
  464. .endm
  465. .macro SAVE2x8
  466. add pCRow1, pCRow0, LDC
  467. ld1 {v0.2s}, [pCRow0]
  468. fmla v0.2s, v16.2s, alphaV0
  469. st1 {v0.2s}, [pCRow0]
  470. add pCRow2, pCRow1, LDC
  471. ld1 {v2.2s}, [pCRow1]
  472. fmla v2.2s, v18.2s, alphaV2
  473. st1 {v2.2s}, [pCRow1]
  474. add pCRow1, pCRow2, LDC
  475. ld1 {v4.2s}, [pCRow2]
  476. fmla v4.2s, v20.2s, alphaV0
  477. st1 {v4.2s}, [pCRow2]
  478. add pCRow2, pCRow1, LDC
  479. ld1 {v6.2s}, [pCRow1]
  480. fmla v6.2s, v22.2s, alphaV2
  481. st1 {v6.2s}, [pCRow1]
  482. add pCRow1, pCRow2, LDC
  483. ld1 {v0.2s}, [pCRow2]
  484. fmla v0.2s, v24.2s, alphaV0
  485. st1 {v0.2s}, [pCRow2]
  486. add pCRow2, pCRow1, LDC
  487. ld1 {v2.2s}, [pCRow1]
  488. fmla v2.2s, v26.2s, alphaV2
  489. st1 {v2.2s}, [pCRow1]
  490. add pCRow1, pCRow2, LDC
  491. ld1 {v4.2s}, [pCRow2]
  492. fmla v4.2s, v28.2s, alphaV0
  493. st1 {v4.2s}, [pCRow2]
  494. ld1 {v6.2s}, [pCRow1]
  495. fmla v6.2s, v30.2s, alphaV2
  496. st1 {v6.2s}, [pCRow1]
  497. add pCRow0, pCRow0, #8
  498. .endm
  499. /******************************************************************************/
  500. .macro INIT1x8
  501. fmov s16, wzr
  502. fmov s18, wzr
  503. fmov s20, wzr
  504. fmov s22, s16
  505. fmov s24, wzr
  506. fmov s26, s16
  507. fmov s28, s18
  508. fmov s30, s20
  509. .endm
  510. .macro KERNEL1x8_SUB
  511. ld1 {v4.4s}, [pB]
  512. add pB, pB, #16
  513. ld1 {v5.4s}, [pB]
  514. add pB, pB, #16
  515. ldr s0, [pA]
  516. add pA, pA, #4
  517. fmla s16, s0, v4.s[0]
  518. fmla s18, s0, v4.s[1]
  519. fmla s20, s0, v4.s[2]
  520. fmla s22, s0, v4.s[3]
  521. fmla s24, s0, v5.s[0]
  522. fmla s26, s0, v5.s[1]
  523. fmla s28, s0, v5.s[2]
  524. fmla s30, s0, v5.s[3]
  525. .endm
  526. .macro SAVE1x8
  527. add pCRow1, pCRow0, LDC
  528. ldr s0, [pCRow0]
  529. fmla s0, s16, alphaV0
  530. str s0, [pCRow0]
  531. add pCRow2, pCRow1, LDC
  532. ldr s2, [pCRow1]
  533. fmla s2, s18, alphaV2
  534. str s2, [pCRow1]
  535. add pCRow1, pCRow2, LDC
  536. ldr s4, [pCRow2]
  537. fmla s4, s20, alphaV0
  538. str s4, [pCRow2]
  539. add pCRow2, pCRow1, LDC
  540. ldr s6, [pCRow1]
  541. fmla s6, s22, alphaV2
  542. str s6, [pCRow1]
  543. add pCRow1, pCRow2, LDC
  544. ldr s0, [pCRow2]
  545. fmla s0, s24, alphaV0
  546. str s0, [pCRow2]
  547. add pCRow2, pCRow1, LDC
  548. ldr s2, [pCRow1]
  549. fmla s2, s26, alphaV2
  550. str s2, [pCRow1]
  551. add pCRow1, pCRow2, LDC
  552. ldr s4, [pCRow2]
  553. fmla s4, s28, alphaV0
  554. str s4, [pCRow2]
  555. ldr s6, [pCRow1]
  556. fmla s6, s30, alphaV2
  557. str s6, [pCRow1]
  558. add pCRow0, pCRow0, #4
  559. .endm
  560. /******************************************************************************/
  561. .macro INIT8x4
  562. fmov s16, wzr
  563. fmov s17, wzr
  564. fmov s20, wzr
  565. fmov s21, s16
  566. fmov s24, wzr
  567. fmov s25, s16
  568. fmov s28, wzr
  569. fmov s29, s16
  570. .endm
  571. .macro KERNEL8x4_I
  572. ld1 {v8.2s, v9.2s}, [pB]
  573. add pB, pB, #16
  574. ld1 {v0.4s}, [pA]
  575. add pA, pA, #16
  576. ld1 {v1.4s}, [pA]
  577. add pA, pA, #16
  578. fmul v16.4s, v0.4s, v8.s[0]
  579. fmul v17.4s, v1.4s, v8.s[0]
  580. fmul v20.4s, v0.4s, v8.s[1]
  581. fmul v21.4s, v1.4s, v8.s[1]
  582. fmul v24.4s, v0.4s, v9.s[0]
  583. fmul v25.4s, v1.4s, v9.s[0]
  584. fmul v28.4s, v0.4s, v9.s[1]
  585. fmul v29.4s, v1.4s, v9.s[1]
  586. ld1 {v12.2s, v13.2s}, [pB]
  587. add pB, pB, #16
  588. ld1 {v4.4s}, [pA]
  589. add pA, pA, #16
  590. ld1 {v5.4s}, [pA]
  591. add pA, pA, #16
  592. .endm
  593. .macro KERNEL8x4_M1
  594. fmla v16.4s, v0.4s, v8.s[0]
  595. fmla v17.4s, v1.4s, v8.s[0]
  596. fmla v20.4s, v0.4s, v8.s[1]
  597. fmla v21.4s, v1.4s, v8.s[1]
  598. fmla v24.4s, v0.4s, v9.s[0]
  599. fmla v25.4s, v1.4s, v9.s[0]
  600. fmla v28.4s, v0.4s, v9.s[1]
  601. fmla v29.4s, v1.4s, v9.s[1]
  602. ld1 {v12.2s, v13.2s}, [pB]
  603. add pB, pB, #16
  604. ld1 {v4.4s}, [pA]
  605. add pA, pA, #16
  606. ld1 {v5.4s}, [pA]
  607. add pA, pA, #16
  608. .endm
  609. .macro KERNEL8x4_M2
  610. fmla v16.4s, v4.4s, v12.s[0]
  611. fmla v17.4s, v5.4s, v12.s[0]
  612. fmla v20.4s, v4.4s, v12.s[1]
  613. fmla v21.4s, v5.4s, v12.s[1]
  614. fmla v24.4s, v4.4s, v13.s[0]
  615. fmla v25.4s, v5.4s, v13.s[0]
  616. fmla v28.4s, v4.4s, v13.s[1]
  617. fmla v29.4s, v5.4s, v13.s[1]
  618. ld1 {v8.2s, v9.2s}, [pB]
  619. add pB, pB, #16
  620. ld1 {v0.4s}, [pA]
  621. add pA, pA, #16
  622. ld1 {v1.4s}, [pA]
  623. add pA, pA, #16
  624. .endm
  625. .macro KERNEL8x4_E
  626. fmla v16.4s, v4.4s, v12.s[0]
  627. fmla v17.4s, v5.4s, v12.s[0]
  628. fmla v20.4s, v4.4s, v12.s[1]
  629. fmla v21.4s, v5.4s, v12.s[1]
  630. fmla v24.4s, v4.4s, v13.s[0]
  631. fmla v25.4s, v5.4s, v13.s[0]
  632. fmla v28.4s, v4.4s, v13.s[1]
  633. fmla v29.4s, v5.4s, v13.s[1]
  634. .endm
  635. .macro KERNEL8x4_SUB
  636. ld1 {v8.2s, v9.2s}, [pB]
  637. add pB, pB, #16
  638. ld1 {v0.4s}, [pA]
  639. add pA, pA, #16
  640. ld1 {v1.4s}, [pA]
  641. add pA, pA, #16
  642. fmla v16.4s, v0.4s, v8.s[0]
  643. fmla v17.4s, v1.4s, v8.s[0]
  644. fmla v20.4s, v0.4s, v8.s[1]
  645. fmla v21.4s, v1.4s, v8.s[1]
  646. fmla v24.4s, v0.4s, v9.s[0]
  647. fmla v25.4s, v1.4s, v9.s[0]
  648. fmla v28.4s, v0.4s, v9.s[1]
  649. fmla v29.4s, v1.4s, v9.s[1]
  650. .endm
  651. .macro SAVE8x4
  652. add pCRow1, pCRow0, LDC
  653. ld1 {v0.4s, v1.4s}, [pCRow0]
  654. fmla v0.4s, v16.4s, alphaV0
  655. fmla v1.4s, v17.4s, alphaV1
  656. st1 {v0.4s, v1.4s}, [pCRow0]
  657. add pCRow2, pCRow1, LDC
  658. ld1 {v4.4s, v5.4s}, [pCRow1]
  659. fmla v4.4s, v20.4s, alphaV0
  660. fmla v5.4s, v21.4s, alphaV1
  661. st1 {v4.4s, v5.4s}, [pCRow1]
  662. add pCRow1, pCRow2, LDC
  663. ld1 {v0.4s, v1.4s}, [pCRow2]
  664. fmla v0.4s, v24.4s, alphaV0
  665. fmla v1.4s, v25.4s, alphaV1
  666. st1 {v0.4s, v1.4s}, [pCRow2]
  667. ld1 {v4.4s, v5.4s}, [pCRow1]
  668. fmla v4.4s, v28.4s, alphaV0
  669. fmla v5.4s, v29.4s, alphaV1
  670. st1 {v4.4s, v5.4s}, [pCRow1]
  671. add pCRow0, pCRow0, #32
  672. .endm
  673. /******************************************************************************/
  674. .macro INIT4x4
  675. fmov s16, wzr
  676. fmov s17, s16
  677. fmov s20, s17
  678. fmov s21, s16
  679. fmov s24, s17
  680. fmov s25, s16
  681. fmov s28, s17
  682. fmov s29, s16
  683. .endm
  684. .macro KERNEL4x4_I
  685. ld1 {v8.2s, v9.2s}, [pB]
  686. add pB, pB, #16
  687. ld1 {v0.2s, v1.2s}, [pA]
  688. add pA, pA, #16
  689. fmul v16.2s, v0.2s, v8.s[0]
  690. fmul v29.2s, v1.2s, v9.s[1]
  691. fmul v20.2s, v0.2s, v8.s[1]
  692. fmul v25.2s, v1.2s, v9.s[0]
  693. fmul v24.2s, v0.2s, v9.s[0]
  694. fmul v21.2s, v1.2s, v8.s[1]
  695. fmul v28.2s, v0.2s, v9.s[1]
  696. fmul v17.2s, v1.2s, v8.s[0]
  697. ld1 {v12.2s, v13.2s}, [pB]
  698. add pB, pB, #16
  699. ld1 {v4.2s, v5.2s}, [pA]
  700. add pA, pA, #16
  701. .endm
  702. .macro KERNEL4x4_M1
  703. fmla v16.2s, v0.2s, v8.s[0]
  704. fmla v29.2s, v1.2s, v9.s[1]
  705. ld1 {v12.2s, v13.2s}, [pB] // For next round
  706. add pB, pB, #16
  707. fmla v20.2s, v0.2s, v8.s[1]
  708. fmla v25.2s, v1.2s, v9.s[0]
  709. ld1 {v4.2s, v5.2s}, [pA] // For next round
  710. add pA, pA, #16
  711. fmla v24.2s, v0.2s, v9.s[0]
  712. fmla v21.2s, v1.2s, v8.s[1]
  713. prfm PLDL1KEEP, [pB, #512]
  714. fmla v28.2s, v0.2s, v9.s[1]
  715. fmla v17.2s, v1.2s, v8.s[0]
  716. .endm
  717. .macro KERNEL4x4_M2
  718. fmla v16.2s, v4.2s, v12.s[0]
  719. fmla v29.2s, v5.2s, v13.s[1]
  720. ld1 {v8.2s, v9.2s}, [pB] // For next round
  721. add pB, pB, #16
  722. fmla v20.2s, v4.2s, v12.s[1]
  723. fmla v25.2s, v5.2s, v13.s[0]
  724. ld1 {v0.2s, v1.2s}, [pA] // For next round
  725. add pA, pA, #16
  726. fmla v24.2s, v4.2s, v13.s[0]
  727. fmla v21.2s, v5.2s, v12.s[1]
  728. prfm PLDL1KEEP, [pA, #512]
  729. fmla v28.2s, v4.2s, v13.s[1]
  730. fmla v17.2s, v5.2s, v12.s[0]
  731. .endm
  732. .macro KERNEL4x4_E
  733. fmla v16.2s, v4.2s, v12.s[0]
  734. fmla v29.2s, v5.2s, v13.s[1]
  735. fmla v20.2s, v4.2s, v12.s[1]
  736. fmla v25.2s, v5.2s, v13.s[0]
  737. fmla v24.2s, v4.2s, v13.s[0]
  738. fmla v21.2s, v5.2s, v12.s[1]
  739. fmla v28.2s, v4.2s, v13.s[1]
  740. fmla v17.2s, v5.2s, v12.s[0]
  741. .endm
  742. .macro KERNEL4x4_SUB
  743. ld1 {v8.2s, v9.2s}, [pB]
  744. add pB, pB, #16
  745. ld1 {v0.2s, v1.2s}, [pA]
  746. add pA, pA, #16
  747. fmla v16.2s, v0.2s, v8.s[0]
  748. fmla v29.2s, v1.2s, v9.s[1]
  749. fmla v20.2s, v0.2s, v8.s[1]
  750. fmla v25.2s, v1.2s, v9.s[0]
  751. fmla v24.2s, v0.2s, v9.s[0]
  752. fmla v21.2s, v1.2s, v8.s[1]
  753. fmla v28.2s, v0.2s, v9.s[1]
  754. fmla v17.2s, v1.2s, v8.s[0]
  755. .endm
756. .macro SAVE4x4
// Write back a 4x4 tile: for each of the 4 C columns (strided by LDC),
// load C, do C += acc * alpha (alphaV0..alphaV3 all hold alpha), store.
// Advances pCRow0 by 16 bytes (4 floats) for the next tile in M.
757. ld1 {v8.2s, v9.2s}, [pCRow0]
758. fmla v8.2s, v16.2s, alphaV0
759. fmla v9.2s, v17.2s, alphaV1
760. st1 {v8.2s, v9.2s}, [pCRow0]
761. add pCRow1, pCRow0, LDC
762. ld1 {v12.2s, v13.2s}, [pCRow1]
763. fmla v12.2s, v20.2s, alphaV2
764. fmla v13.2s, v21.2s, alphaV3
765. st1 {v12.2s, v13.2s}, [pCRow1]
766. add pCRow2, pCRow1, LDC
767. ld1 {v8.2s, v9.2s}, [pCRow2]
768. fmla v8.2s, v24.2s, alphaV0
769. fmla v9.2s, v25.2s, alphaV1
770. st1 {v8.2s, v9.2s}, [pCRow2]
771. add pCRow1, pCRow2, LDC
772. ld1 {v12.2s, v13.2s}, [pCRow1]
773. fmla v12.2s, v28.2s, alphaV2
774. fmla v13.2s, v29.2s, alphaV3
775. st1 {v12.2s, v13.2s}, [pCRow1]
776. add pCRow0, pCRow0, #16
777. .endm
  778. /******************************************************************************/
779. .macro INIT2x4
// Zero the 2x4 accumulators. A write to an S register clears the rest
// of the 128-bit vector, so this zeroes v16/v20/v24/v28 entirely.
780. fmov s16, wzr
781. fmov s20, s16
782. fmov s24, s20
783. fmov s28, s16
784. .endm
  785. .macro KERNEL2x4_SUB
  786. ld1 {v8.2s, v9.2s}, [pB]
  787. add pB, pB, #16
  788. ld1 {v0.2s}, [pA]
  789. add pA, pA, #8
  790. fmla v16.2s, v0.2s, v8.s[0]
  791. fmla v20.2s, v0.2s, v8.s[1]
  792. fmla v24.2s, v0.2s, v9.s[0]
  793. fmla v28.2s, v0.2s, v9.s[1]
  794. .endm
  795. .macro SAVE2x4
  796. ld1 {v8.2s}, [pCRow0]
  797. fmla v8.2s, v16.2s, alphaV0
  798. st1 {v8.2s}, [pCRow0]
  799. add pCRow1, pCRow0, LDC
  800. ld1 {v12.2s}, [pCRow1]
  801. fmla v12.2s, v20.2s, alphaV1
  802. st1 {v12.2s}, [pCRow1]
  803. add pCRow2, pCRow1, LDC
  804. ld1 {v8.2s}, [pCRow2]
  805. fmla v8.2s, v24.2s, alphaV2
  806. st1 {v8.2s}, [pCRow2]
  807. add pCRow1, pCRow2, LDC
  808. ld1 {v12.2s}, [pCRow1]
  809. fmla v12.2s, v28.2s, alphaV3
  810. st1 {v12.2s}, [pCRow1]
  811. add pCRow0, pCRow0, #8
  812. .endm
  813. /******************************************************************************/
  814. .macro INIT1x4
  815. fmov s16, wzr
  816. fmov s20, s16
  817. .endm
  818. .macro KERNEL1x4_SUB
  819. ldr s0, [pA]
  820. add pA, pA, #4
  821. ld1 {v8.2s, v9.2s}, [pB]
  822. add pB, pB, #16
  823. fmla v16.2s, v8.2s, v0.s[0]
  824. fmla v20.2s, v9.2s, v0.s[0]
  825. .endm
  826. .macro SAVE1x4
  827. add pCRow1, pCRow0, LDC
  828. ld1 {v8.s}[0], [pCRow0]
  829. ld1 {v8.s}[1], [pCRow1]
  830. fmla v8.2s, v16.2s, alphaV0
  831. st1 {v8.s}[0], [pCRow0]
  832. st1 {v8.s}[1], [pCRow1]
  833. add pCRow2, pCRow1, LDC
  834. add pCRow1, pCRow2, LDC
  835. ld1 {v12.s}[0], [pCRow2]
  836. ld1 {v12.s}[1], [pCRow1]
  837. fmla v12.2s, v20.2s, alphaV1
  838. st1 {v12.s}[0], [pCRow2]
  839. st1 {v12.s}[1], [pCRow1]
  840. add pCRow0, pCRow0, #4
  841. .endm
  842. /******************************************************************************/
  843. .macro INIT8x2
  844. fmov s16, wzr
  845. fmov s17, s16
  846. fmov s20, s17
  847. fmov s21, s16
  848. .endm
  849. .macro KERNEL8x2_SUB
  850. ld1 {v8.2s}, [pB]
  851. add pB, pB, #8
  852. ld1 {v0.4s}, [pA]
  853. add pA, pA, #16
  854. ld1 {v1.4s}, [pA]
  855. add pA, pA, #16
  856. fmla v16.4s, v0.4s, v8.s[0]
  857. fmla v17.4s, v1.4s, v8.s[0]
  858. fmla v20.4s, v0.4s, v8.s[1]
  859. fmla v21.4s, v1.4s, v8.s[1]
  860. .endm
  861. .macro SAVE8x2
  862. add pCRow1, pCRow0, LDC
  863. ld1 {v0.4s, v1.4s}, [pCRow0]
  864. fmla v0.4s, v16.4s, alphaV0
  865. fmla v1.4s, v17.4s, alphaV1
  866. st1 {v0.4s, v1.4s}, [pCRow0]
  867. add pCRow2, pCRow1, LDC
  868. ld1 {v4.4s, v5.4s}, [pCRow1]
  869. fmla v4.4s, v20.4s, alphaV0
  870. fmla v5.4s, v21.4s, alphaV1
  871. st1 {v4.4s, v5.4s}, [pCRow1]
  872. add pCRow0, pCRow0, #32
  873. .endm
  874. /******************************************************************************/
  875. .macro INIT4x2
  876. fmov s16, wzr
  877. fmov s17, s16
  878. fmov s20, s17
  879. fmov s21, s16
  880. .endm
  881. .macro KERNEL4x2_SUB
  882. ld1 {v8.2s}, [pB]
  883. add pB, pB, #8
  884. ld1 {v0.2s, v1.2s}, [pA]
  885. add pA, pA, #16
  886. fmla v16.2s, v0.2s, v8.s[0]
  887. fmla v17.2s, v1.2s, v8.s[0]
  888. fmla v20.2s, v0.2s, v8.s[1]
  889. fmla v21.2s, v1.2s, v8.s[1]
  890. .endm
  891. .macro SAVE4x2
  892. ld1 {v8.2s, v9.2s}, [pCRow0]
  893. fmla v8.2s, v16.2s, alphaV0
  894. fmla v9.2s, v17.2s, alphaV1
  895. st1 {v8.2s, v9.2s}, [pCRow0]
  896. add pCRow1, pCRow0, LDC
  897. ld1 {v12.2s, v13.2s}, [pCRow1]
  898. fmla v12.2s, v20.2s, alphaV2
  899. fmla v13.2s, v21.2s, alphaV3
  900. st1 {v12.2s, v13.2s}, [pCRow1]
  901. add pCRow0, pCRow0, #16
  902. .endm
  903. /******************************************************************************/
  904. .macro INIT2x2
  905. fmov s16, wzr
  906. fmov s20, s16
  907. .endm
  908. .macro KERNEL2x2_SUB
  909. ld1 {v8.2s}, [pB]
  910. add pB, pB, #8
  911. ld1 {v0.2s}, [pA]
  912. add pA, pA, #8
  913. fmla v16.2s, v0.2s, v8.s[0]
  914. fmla v20.2s, v0.2s, v8.s[1]
  915. .endm
  916. .macro SAVE2x2
  917. ld1 {v8.2s}, [pCRow0]
  918. fmla v8.2s, v16.2s, alphaV0
  919. st1 {v8.2s}, [pCRow0]
  920. add pCRow1 , pCRow0, LDC
  921. ld1 {v12.2s}, [pCRow1]
  922. fmla v12.2s, v20.2s, alphaV1
  923. st1 {v12.2s}, [pCRow1]
  924. add pCRow0, pCRow0, #8
  925. .endm
  926. /******************************************************************************/
  927. .macro INIT1x2
  928. fmov s16, wzr
  929. .endm
  930. .macro KERNEL1x2_SUB
  931. ld1 {v8.2s} , [pB]
  932. add pB , pB, #8
  933. ldr s0 , [pA]
  934. add pA, pA, #4
  935. fmla v16.2s, v8.2s, v0.s[0]
  936. .endm
  937. .macro SAVE1x2
  938. add pCRow1 , pCRow0, LDC
  939. ld1 {v8.s}[0], [pCRow0]
  940. ld1 {v8.s}[1], [pCRow1]
  941. fmla v8.2s, v16.2s, alphaV0
  942. st1 {v8.s}[0], [pCRow0]
  943. st1 {v8.s}[1], [pCRow1]
  944. add pCRow0, pCRow0, #4
  945. .endm
  946. /******************************************************************************/
  947. .macro INIT8x1
  948. fmov s16, wzr
  949. fmov s17, wzr
  950. .endm
  951. .macro KERNEL8x1_SUB
  952. ldr s8, [pB]
  953. add pB , pB, #4
  954. ld1 {v0.4s}, [pA]
  955. add pA, pA, #16
  956. ld1 {v1.4s}, [pA]
  957. add pA, pA, #16
  958. fmla v16.4s, v0.4s, v8.s[0]
  959. fmla v17.4s, v1.4s, v8.s[0]
  960. .endm
  961. .macro SAVE8x1
  962. ld1 {v0.4s, v1.4s}, [pCRow0]
  963. fmla v0.4s, v16.4s, alphaV0
  964. fmla v1.4s, v17.4s, alphaV1
  965. st1 {v0.4s, v1.4s}, [pCRow0]
  966. add pCRow0, pCRow0, #32
  967. .endm
  968. /******************************************************************************/
  969. .macro INIT4x1
  970. fmov s16, wzr
  971. fmov s17, s16
  972. .endm
  973. .macro KERNEL4x1_SUB
  974. ldr s8, [pB]
  975. add pB , pB, #4
  976. ld1 {v0.2s, v1.2s}, [pA]
  977. add pA , pA, #16
  978. fmla v16.2s, v0.2s, v8.s[0]
  979. fmla v17.2s, v1.2s, v8.s[0]
  980. .endm
  981. .macro SAVE4x1
  982. ld1 {v8.2s, v9.2s}, [pCRow0]
  983. fmla v8.2s, v16.2s, alphaV0
  984. fmla v9.2s, v17.2s, alphaV1
  985. st1 {v8.2s, v9.2s}, [pCRow0]
  986. add pCRow0, pCRow0, #16
  987. .endm
  988. /******************************************************************************/
  989. .macro INIT2x1
  990. fmov s16, wzr
  991. .endm
  992. .macro KERNEL2x1_SUB
  993. ldr s8, [pB]
  994. add pB , pB, #4
  995. ld1 {v0.2s}, [pA]
  996. add pA , pA, #8
  997. fmla v16.2s, v0.2s, v8.s[0]
  998. .endm
  999. .macro SAVE2x1
  1000. ld1 {v8.2s}, [pCRow0]
  1001. fmla v8.2s, v16.2s, alphaV0
  1002. st1 {v8.2s}, [pCRow0]
  1003. add pCRow0, pCRow0, #8
  1004. .endm
  1005. /******************************************************************************/
  1006. .macro INIT1x1
  1007. fmov s16, wzr
  1008. .endm
  1009. .macro KERNEL1x1_SUB
  1010. ldr s8, [pB]
  1011. add pB , pB, #4
  1012. ldr s0, [pA]
  1013. add pA , pA, #4
  1014. fmadd s16, s0, s8, s16
  1015. .endm
  1016. .macro SAVE1x1
  1017. ldr s8, [pCRow0]
  1018. fmla s8, s16, alphaV0
  1019. str s8, [pCRow0]
  1020. add pCRow0, pCRow0, #4
  1021. .endm
  1022. /*******************************************************************************
  1023. * End of macro definitions
  1024. *******************************************************************************/
1025. PROLOGUE
1026. .Lsgemm_kernel_begin:
1027. .align 5
// Reserve 176 bytes and spill registers. AAPCS64 requires saving
// d8-d15 and x19-x28; d16/d17 and x18 are also saved here (x18 is the
// platform register - presumably preserved defensively; confirm against
// the project's conventions).
1028. add sp, sp, #-(11 * 16)
1029. stp d8, d9, [sp, #(0 * 16)]
1030. stp d10, d11, [sp, #(1 * 16)]
1031. stp d12, d13, [sp, #(2 * 16)]
1032. stp d14, d15, [sp, #(3 * 16)]
1033. stp d16, d17, [sp, #(4 * 16)]
1034. stp x18, x19, [sp, #(5 * 16)]
1035. stp x20, x21, [sp, #(6 * 16)]
1036. stp x22, x23, [sp, #(7 * 16)]
1037. stp x24, x25, [sp, #(8 * 16)]
1038. stp x26, x27, [sp, #(9 * 16)]
1039. str x28, [sp, #(10 * 16)]
// Broadcast the incoming scalar alpha (s0) into the four alpha aliases
// used by the SAVE macros.
1040. fmov alpha0, s0
1041. fmov alpha1, s0
1042. fmov alpha2, s0
1043. fmov alpha3, s0
// Convert LDC from elements to bytes (sizeof(float) == 4).
1044. lsl LDC, LDC, #2 // ldc = ldc * 4
1045. mov pB, origPB
// Outer loop count: number of full 8-wide column panels of N.
1046. mov counterJ, origN
1047. asr counterJ, counterJ, #3 // J = J / 8
1048. cmp counterJ, #0
1049. ble .Lsgemm_kernel_L4_BEGIN // N < 8: fall through to narrower panels
  1050. /******************************************************************************/
  1051. /******************************************************************************/
// ----- N-panel of width 8: M is tiled as 8, then 4, 2, 1 remainders.
1052. .Lsgemm_kernel_L8_BEGIN:
1053. mov pCRow0, pC // pCRow0 = C
1054. add pC, pC, LDC, lsl #3 // pC += 8 columns
1055. mov pA, origPA // pA = start of A array
1056. /******************************************************************************/
1057. .Lsgemm_kernel_L8_M8_BEGIN:
1058. mov counterI, origM
1059. asr counterI, counterI, #3 // counterI = counterI / 8
1060. cmp counterI, #0
1061. ble .Lsgemm_kernel_L8_M4_BEGIN
// 8x8 tiles, software-pipelined over K in steps of 2 (I/M1/M2/E scheme).
1062. .Lsgemm_kernel_L8_M8_20:
1063. mov pB, origPB
1064. asr counterL , origK, #1 // L = K / 2
1065. cmp counterL , #2 // is there at least 4 to do?
1066. blt .Lsgemm_kernel_L8_M8_32
1067. KERNEL8x8_I // do one in the K
1068. KERNEL8x8_M2 // do another in the K
1069. subs counterL, counterL, #2
1070. ble .Lsgemm_kernel_L8_M8_22a
1071. .align 5
1072. .Lsgemm_kernel_L8_M8_22:
1073. KERNEL8x8_M1
1074. KERNEL8x8_M2
1075. subs counterL, counterL, #1
1076. bgt .Lsgemm_kernel_L8_M8_22
1077. .Lsgemm_kernel_L8_M8_22a:
1078. KERNEL8x8_M1
1079. KERNEL8x8_E // drain the pipeline
1080. b .Lsgemm_kernel_L8_M8_44
// K/2 < 2: at most one pipelined pair.
1081. .Lsgemm_kernel_L8_M8_32:
1082. tst counterL, #1
1083. ble .Lsgemm_kernel_L8_M8_40
1084. KERNEL8x8_I
1085. KERNEL8x8_E
1086. b .Lsgemm_kernel_L8_M8_44
1087. .Lsgemm_kernel_L8_M8_40:
1088. INIT8x8 // no pairs at all: just zero accumulators
1089. .Lsgemm_kernel_L8_M8_44:
// Handle odd K with one plain subiteration.
1090. ands counterL , origK, #1
1091. ble .Lsgemm_kernel_L8_M8_100
1092. .Lsgemm_kernel_L8_M8_46:
1093. KERNEL8x8_SUB
1094. .Lsgemm_kernel_L8_M8_100:
1095. SAVE8x8
1096. .Lsgemm_kernel_L8_M8_END:
1097. subs counterI, counterI, #1
1098. bne .Lsgemm_kernel_L8_M8_20
1099. /******************************************************************************/
// M remainder: one 4-wide tile if bit 2 of M is set.
1100. .Lsgemm_kernel_L8_M4_BEGIN:
1101. mov counterI, origM
1102. tst counterI , #7
1103. ble .Lsgemm_kernel_L8_END
1104. tst counterI, #4
1105. ble .Lsgemm_kernel_L8_M2_BEGIN
1106. .Lsgemm_kernel_L8_M4_20:
1107. mov pB, origPB
1108. asr counterL , origK, #1 // L = K / 2
1109. cmp counterL , #2 // is there at least 4 to do?
1110. blt .Lsgemm_kernel_L8_M4_32
1111. KERNEL4x8_I // do one in the K
1112. KERNEL4x8_M2 // do another in the K
1113. subs counterL, counterL, #2
1114. ble .Lsgemm_kernel_L8_M4_22a
1115. .align 5
1116. .Lsgemm_kernel_L8_M4_22:
1117. KERNEL4x8_M1
1118. KERNEL4x8_M2
1119. subs counterL, counterL, #1
1120. bgt .Lsgemm_kernel_L8_M4_22
1121. .Lsgemm_kernel_L8_M4_22a:
1122. KERNEL4x8_M1
1123. KERNEL4x8_E
1124. b .Lsgemm_kernel_L8_M4_44
1125. .Lsgemm_kernel_L8_M4_32:
1126. tst counterL, #1
1127. ble .Lsgemm_kernel_L8_M4_40
1128. KERNEL4x8_I
1129. KERNEL4x8_E
1130. b .Lsgemm_kernel_L8_M4_44
1131. .Lsgemm_kernel_L8_M4_40:
1132. INIT4x8
1133. .Lsgemm_kernel_L8_M4_44:
1134. ands counterL , origK, #1
1135. ble .Lsgemm_kernel_L8_M4_100
1136. .Lsgemm_kernel_L8_M4_46:
1137. KERNEL4x8_SUB
1138. .Lsgemm_kernel_L8_M4_100:
1139. SAVE4x8
1140. .Lsgemm_kernel_L8_M4_END:
1141. /******************************************************************************/
// M remainder: one 2-wide tile, plain K loop unrolled by 8.
1142. .Lsgemm_kernel_L8_M2_BEGIN:
1143. mov counterI, origM
1144. tst counterI , #3
1145. ble .Lsgemm_kernel_L8_END
1146. tst counterI, #2 // counterI = counterI / 2
1147. ble .Lsgemm_kernel_L8_M1_BEGIN
1148. .Lsgemm_kernel_L8_M2_20:
1149. INIT2x8
1150. mov pB, origPB
1151. asr counterL , origK, #3 // counterL = counterL / 8
1152. cmp counterL , #0
1153. ble .Lsgemm_kernel_L8_M2_40
1154. .Lsgemm_kernel_L8_M2_22:
1155. KERNEL2x8_SUB
1156. KERNEL2x8_SUB
1157. KERNEL2x8_SUB
1158. KERNEL2x8_SUB
1159. KERNEL2x8_SUB
1160. KERNEL2x8_SUB
1161. KERNEL2x8_SUB
1162. KERNEL2x8_SUB
1163. subs counterL, counterL, #1
1164. bgt .Lsgemm_kernel_L8_M2_22
1165. .Lsgemm_kernel_L8_M2_40:
1166. ands counterL , origK, #7 // counterL = counterL % 8
1167. ble .Lsgemm_kernel_L8_M2_100
1168. .Lsgemm_kernel_L8_M2_42:
1169. KERNEL2x8_SUB
1170. subs counterL, counterL, #1
1171. bgt .Lsgemm_kernel_L8_M2_42
1172. .Lsgemm_kernel_L8_M2_100:
1173. SAVE2x8
1174. .Lsgemm_kernel_L8_M2_END:
1175. /******************************************************************************/
// M remainder: final single row, plain K loop unrolled by 8.
1176. .Lsgemm_kernel_L8_M1_BEGIN:
1177. tst counterI, #1 // counterI = counterI % 2
1178. ble .Lsgemm_kernel_L8_END
1179. .Lsgemm_kernel_L8_M1_20:
1180. INIT1x8
1181. mov pB, origPB
1182. asr counterL , origK, #3 // counterL = counterL / 8
1183. cmp counterL , #0
1184. ble .Lsgemm_kernel_L8_M1_40
1185. .Lsgemm_kernel_L8_M1_22:
1186. KERNEL1x8_SUB
1187. KERNEL1x8_SUB
1188. KERNEL1x8_SUB
1189. KERNEL1x8_SUB
1190. KERNEL1x8_SUB
1191. KERNEL1x8_SUB
1192. KERNEL1x8_SUB
1193. KERNEL1x8_SUB
1194. subs counterL, counterL, #1
1195. bgt .Lsgemm_kernel_L8_M1_22
1196. .Lsgemm_kernel_L8_M1_40:
1197. ands counterL , origK, #7 // counterL = counterL % 8
1198. ble .Lsgemm_kernel_L8_M1_100
1199. .Lsgemm_kernel_L8_M1_42:
1200. KERNEL1x8_SUB
1201. subs counterL, counterL, #1
1202. bgt .Lsgemm_kernel_L8_M1_42
1203. .Lsgemm_kernel_L8_M1_100:
1204. SAVE1x8
1205. .Lsgemm_kernel_L8_END:
// Advance B past this 8-column panel: K * 8 cols * 4 bytes = K << 5.
1206. lsl temp, origK, #5 // B = B + K * 4 * 8
1207. add origPB, origPB, temp
1208. subs counterJ, counterJ , #1 // j--
1209. bgt .Lsgemm_kernel_L8_BEGIN
  1210. /******************************************************************************/
  1211. /******************************************************************************/
// ----- N-panel of width 4 (handles bit 2 of the N remainder).
1212. .Lsgemm_kernel_L4_BEGIN:
1213. mov counterJ , origN
1214. tst counterJ , #7
1215. ble .Lsgemm_kernel_L999 // N was a multiple of 8: all done
1216. tst counterJ , #4
1217. ble .Lsgemm_kernel_L2_BEGIN
1218. mov pCRow0, pC // pCRow0 = pC
1219. add pC,pC,LDC, lsl #2 // pC += 4 columns
1220. mov pA, origPA // pA = A
1221. /******************************************************************************/
1222. .Lsgemm_kernel_L4_M8_BEGIN:
1223. mov counterI, origM
1224. asr counterI, counterI, #3 // counterI = counterI / 8
1225. cmp counterI, #0
1226. ble .Lsgemm_kernel_L4_M4_BEGIN
// 8x4 tiles, software-pipelined over K in steps of 2 (I/M1/M2/E scheme).
1227. .Lsgemm_kernel_L4_M8_20:
1228. mov pB, origPB
1229. asr counterL , origK, #1 // L = K / 2
1230. cmp counterL , #2 // is there at least 4 to do?
1231. blt .Lsgemm_kernel_L4_M8_32
1232. KERNEL8x4_I // do one in the K
1233. KERNEL8x4_M2 // do another in the K
1234. subs counterL, counterL, #2
1235. ble .Lsgemm_kernel_L4_M8_22a
1236. .align 5
1237. .Lsgemm_kernel_L4_M8_22:
1238. KERNEL8x4_M1
1239. KERNEL8x4_M2
1240. subs counterL, counterL, #1
1241. bgt .Lsgemm_kernel_L4_M8_22
1242. .Lsgemm_kernel_L4_M8_22a:
1243. KERNEL8x4_M1
1244. KERNEL8x4_E
1245. b .Lsgemm_kernel_L4_M8_44
1246. .Lsgemm_kernel_L4_M8_32:
1247. tst counterL, #1
1248. ble .Lsgemm_kernel_L4_M8_40
1249. KERNEL8x4_I
1250. KERNEL8x4_E
1251. b .Lsgemm_kernel_L4_M8_44
1252. .Lsgemm_kernel_L4_M8_40:
1253. INIT8x4
1254. .Lsgemm_kernel_L4_M8_44:
1255. ands counterL , origK, #1 // odd-K tail
1256. ble .Lsgemm_kernel_L4_M8_100
1257. .Lsgemm_kernel_L4_M8_46:
1258. KERNEL8x4_SUB
1259. .Lsgemm_kernel_L4_M8_100:
1260. SAVE8x4
1261. .Lsgemm_kernel_L4_M8_END:
1262. subs counterI, counterI, #1
1263. bne .Lsgemm_kernel_L4_M8_20
1264. /******************************************************************************/
// M remainder: one 4-wide tile, same pipelined scheme.
1265. .Lsgemm_kernel_L4_M4_BEGIN:
1266. mov counterI, origM
1267. tst counterI , #7
1268. ble .Lsgemm_kernel_L4_END
1269. tst counterI, #4
1270. ble .Lsgemm_kernel_L4_M2_BEGIN
1271. .Lsgemm_kernel_L4_M4_20:
1272. mov pB, origPB
1273. asr counterL , origK, #1 // L = K / 2
1274. cmp counterL , #2 // is there at least 4 to do?
1275. blt .Lsgemm_kernel_L4_M4_32
1276. KERNEL4x4_I // do one in the K
1277. KERNEL4x4_M2 // do another in the K
1278. subs counterL, counterL, #2
1279. ble .Lsgemm_kernel_L4_M4_22a
1280. .align 5
1281. .Lsgemm_kernel_L4_M4_22:
1282. KERNEL4x4_M1
1283. KERNEL4x4_M2
1284. subs counterL, counterL, #1
1285. bgt .Lsgemm_kernel_L4_M4_22
1286. .Lsgemm_kernel_L4_M4_22a:
1287. KERNEL4x4_M1
1288. KERNEL4x4_E
1289. b .Lsgemm_kernel_L4_M4_44
1290. .Lsgemm_kernel_L4_M4_32:
1291. tst counterL, #1
1292. ble .Lsgemm_kernel_L4_M4_40
1293. KERNEL4x4_I
1294. KERNEL4x4_E
1295. b .Lsgemm_kernel_L4_M4_44
1296. .Lsgemm_kernel_L4_M4_40:
1297. INIT4x4
1298. .Lsgemm_kernel_L4_M4_44:
1299. ands counterL , origK, #1 // odd-K tail
1300. ble .Lsgemm_kernel_L4_M4_100
1301. .Lsgemm_kernel_L4_M4_46:
1302. KERNEL4x4_SUB
1303. .Lsgemm_kernel_L4_M4_100:
1304. SAVE4x4
1305. .Lsgemm_kernel_L4_M4_END:
1306. /******************************************************************************/
// M remainder: one 2-wide tile, plain K loop unrolled by 8.
1307. .Lsgemm_kernel_L4_M2_BEGIN:
1308. mov counterI, origM
1309. tst counterI , #3
1310. ble .Lsgemm_kernel_L4_END
1311. tst counterI, #2 // counterI = counterI / 2
1312. ble .Lsgemm_kernel_L4_M1_BEGIN
1313. .Lsgemm_kernel_L4_M2_20:
1314. INIT2x4
1315. mov pB, origPB
1316. asr counterL , origK, #3 // counterL = counterL / 8
1317. cmp counterL , #0
1318. ble .Lsgemm_kernel_L4_M2_40
1319. .Lsgemm_kernel_L4_M2_22:
1320. KERNEL2x4_SUB
1321. KERNEL2x4_SUB
1322. KERNEL2x4_SUB
1323. KERNEL2x4_SUB
1324. KERNEL2x4_SUB
1325. KERNEL2x4_SUB
1326. KERNEL2x4_SUB
1327. KERNEL2x4_SUB
1328. subs counterL, counterL, #1
1329. bgt .Lsgemm_kernel_L4_M2_22
1330. .Lsgemm_kernel_L4_M2_40:
1331. ands counterL , origK, #7 // counterL = counterL % 8
1332. ble .Lsgemm_kernel_L4_M2_100
1333. .Lsgemm_kernel_L4_M2_42:
1334. KERNEL2x4_SUB
1335. subs counterL, counterL, #1
1336. bgt .Lsgemm_kernel_L4_M2_42
1337. .Lsgemm_kernel_L4_M2_100:
1338. SAVE2x4
1339. .Lsgemm_kernel_L4_M2_END:
1340. /******************************************************************************/
// M remainder: final single row.
1341. .Lsgemm_kernel_L4_M1_BEGIN:
1342. tst counterI, #1 // counterI = counterI % 2
1343. ble .Lsgemm_kernel_L4_END
1344. .Lsgemm_kernel_L4_M1_20:
1345. INIT1x4
1346. mov pB, origPB
1347. asr counterL , origK, #3 // counterL = counterL / 8
1348. cmp counterL , #0
1349. ble .Lsgemm_kernel_L4_M1_40
1350. .Lsgemm_kernel_L4_M1_22:
1351. KERNEL1x4_SUB
1352. KERNEL1x4_SUB
1353. KERNEL1x4_SUB
1354. KERNEL1x4_SUB
1355. KERNEL1x4_SUB
1356. KERNEL1x4_SUB
1357. KERNEL1x4_SUB
1358. KERNEL1x4_SUB
1359. subs counterL, counterL, #1
1360. bgt .Lsgemm_kernel_L4_M1_22
1361. .Lsgemm_kernel_L4_M1_40:
1362. ands counterL , origK, #7 // counterL = counterL % 8
1363. ble .Lsgemm_kernel_L4_M1_100
1364. .Lsgemm_kernel_L4_M1_42:
1365. KERNEL1x4_SUB
1366. subs counterL, counterL, #1
1367. bgt .Lsgemm_kernel_L4_M1_42
1368. .Lsgemm_kernel_L4_M1_100:
1369. SAVE1x4
1370. .Lsgemm_kernel_L4_END:
// Advance B past this 4-column panel: K * 4 cols * 4 bytes = K << 4.
1371. add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4
  1372. /******************************************************************************/
  1373. /******************************************************************************/
// ----- N-panel of width 2 (handles bit 1 of the N remainder).
1374. .Lsgemm_kernel_L2_BEGIN: // less than 2 left in N direction
1375. mov counterJ , origN
1376. tst counterJ , #3
1377. ble .Lsgemm_kernel_L999 // no N remainder below 4: done
1378. tst counterJ , #2
1379. ble .Lsgemm_kernel_L1_BEGIN
1380. mov pCRow0, pC // pCRow0 = pC
1381. add pC,pC,LDC, lsl #1 // pC += 2 columns
1382. mov pA, origPA // pA = A
1383. /******************************************************************************/
// 8x2 tiles, plain K loop unrolled by 8 (no software pipelining here).
1384. .Lsgemm_kernel_L2_M8_BEGIN:
1385. mov counterI, origM
1386. asr counterI, counterI, #3 // counterI = counterI / 8
1387. cmp counterI,#0
1388. ble .Lsgemm_kernel_L2_M4_BEGIN
1389. .Lsgemm_kernel_L2_M8_20:
1390. INIT8x2
1391. mov pB, origPB
1392. asr counterL , origK, #3 // counterL = counterL / 8
1393. cmp counterL,#0
1394. ble .Lsgemm_kernel_L2_M8_40
1395. .align 5
1396. .Lsgemm_kernel_L2_M8_22:
1397. KERNEL8x2_SUB
1398. KERNEL8x2_SUB
1399. KERNEL8x2_SUB
1400. KERNEL8x2_SUB
1401. KERNEL8x2_SUB
1402. KERNEL8x2_SUB
1403. KERNEL8x2_SUB
1404. KERNEL8x2_SUB
1405. subs counterL, counterL, #1
1406. bgt .Lsgemm_kernel_L2_M8_22
1407. .Lsgemm_kernel_L2_M8_40:
1408. ands counterL , origK, #7 // counterL = counterL % 8
1409. ble .Lsgemm_kernel_L2_M8_100
1410. .Lsgemm_kernel_L2_M8_42:
1411. KERNEL8x2_SUB
1412. subs counterL, counterL, #1
1413. bgt .Lsgemm_kernel_L2_M8_42
1414. .Lsgemm_kernel_L2_M8_100:
1415. SAVE8x2
1416. .Lsgemm_kernel_L2_M8_END:
1417. subs counterI, counterI, #1
1418. bgt .Lsgemm_kernel_L2_M8_20
1419. /******************************************************************************/
// M remainder: one 4-wide tile.
1420. .Lsgemm_kernel_L2_M4_BEGIN:
1421. mov counterI, origM
1422. tst counterI , #7
1423. ble .Lsgemm_kernel_L2_END
1424. tst counterI, #4
1425. ble .Lsgemm_kernel_L2_M2_BEGIN
1426. .Lsgemm_kernel_L2_M4_20:
1427. INIT4x2
1428. mov pB, origPB
1429. asr counterL , origK, #3 // counterL = counterL / 8
1430. cmp counterL,#0
1431. ble .Lsgemm_kernel_L2_M4_40
1432. .align 5
1433. .Lsgemm_kernel_L2_M4_22:
1434. KERNEL4x2_SUB
1435. KERNEL4x2_SUB
1436. KERNEL4x2_SUB
1437. KERNEL4x2_SUB
1438. KERNEL4x2_SUB
1439. KERNEL4x2_SUB
1440. KERNEL4x2_SUB
1441. KERNEL4x2_SUB
1442. subs counterL, counterL, #1
1443. bgt .Lsgemm_kernel_L2_M4_22
1444. .Lsgemm_kernel_L2_M4_40:
1445. ands counterL , origK, #7 // counterL = counterL % 8
1446. ble .Lsgemm_kernel_L2_M4_100
1447. .Lsgemm_kernel_L2_M4_42:
1448. KERNEL4x2_SUB
1449. subs counterL, counterL, #1
1450. bgt .Lsgemm_kernel_L2_M4_42
1451. .Lsgemm_kernel_L2_M4_100:
1452. SAVE4x2
1453. .Lsgemm_kernel_L2_M4_END:
1454. /******************************************************************************/
// M remainder: one 2-wide tile.
1455. .Lsgemm_kernel_L2_M2_BEGIN:
1456. mov counterI, origM
1457. tst counterI , #3
1458. ble .Lsgemm_kernel_L2_END
1459. tst counterI, #2 // counterI = counterI / 2
1460. ble .Lsgemm_kernel_L2_M1_BEGIN
1461. .Lsgemm_kernel_L2_M2_20:
1462. INIT2x2
1463. mov pB, origPB
1464. asr counterL , origK, #3 // counterL = counterL / 8
1465. cmp counterL,#0
1466. ble .Lsgemm_kernel_L2_M2_40
1467. .Lsgemm_kernel_L2_M2_22:
1468. KERNEL2x2_SUB
1469. KERNEL2x2_SUB
1470. KERNEL2x2_SUB
1471. KERNEL2x2_SUB
1472. KERNEL2x2_SUB
1473. KERNEL2x2_SUB
1474. KERNEL2x2_SUB
1475. KERNEL2x2_SUB
1476. subs counterL, counterL, #1
1477. bgt .Lsgemm_kernel_L2_M2_22
1478. .Lsgemm_kernel_L2_M2_40:
1479. ands counterL , origK, #7 // counterL = counterL % 8
1480. ble .Lsgemm_kernel_L2_M2_100
1481. .Lsgemm_kernel_L2_M2_42:
1482. KERNEL2x2_SUB
1483. subs counterL, counterL, #1
1484. bgt .Lsgemm_kernel_L2_M2_42
1485. .Lsgemm_kernel_L2_M2_100:
1486. SAVE2x2
1487. .Lsgemm_kernel_L2_M2_END:
1488. /******************************************************************************/
// M remainder: final single row.
1489. .Lsgemm_kernel_L2_M1_BEGIN:
1490. tst counterI, #1 // counterI = counterI % 2
1491. ble .Lsgemm_kernel_L2_END
1492. .Lsgemm_kernel_L2_M1_20:
1493. INIT1x2
1494. mov pB, origPB
1495. asr counterL , origK, #3 // counterL = counterL / 8
1496. cmp counterL, #0
1497. ble .Lsgemm_kernel_L2_M1_40
1498. .Lsgemm_kernel_L2_M1_22:
1499. KERNEL1x2_SUB
1500. KERNEL1x2_SUB
1501. KERNEL1x2_SUB
1502. KERNEL1x2_SUB
1503. KERNEL1x2_SUB
1504. KERNEL1x2_SUB
1505. KERNEL1x2_SUB
1506. KERNEL1x2_SUB
1507. subs counterL, counterL, #1
1508. bgt .Lsgemm_kernel_L2_M1_22
1509. .Lsgemm_kernel_L2_M1_40:
1510. ands counterL , origK, #7 // counterL = counterL % 8
1511. ble .Lsgemm_kernel_L2_M1_100
1512. .Lsgemm_kernel_L2_M1_42:
1513. KERNEL1x2_SUB
1514. subs counterL, counterL, #1
1515. bgt .Lsgemm_kernel_L2_M1_42
1516. .Lsgemm_kernel_L2_M1_100:
1517. SAVE1x2
1518. .Lsgemm_kernel_L2_END:
// Advance B past this 2-column panel: K * 2 cols * 4 bytes = K << 3.
1519. add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4
  1520. /******************************************************************************/
  1521. /******************************************************************************/
// ----- N-panel of width 1 (handles bit 0 of the N remainder).
1522. .Lsgemm_kernel_L1_BEGIN:
1523. mov counterJ , origN
1524. tst counterJ , #1
1525. ble .Lsgemm_kernel_L999 // done
1526. mov pCRow0, pC // pCRow0 = C
1527. add pC , pC , LDC // Update pC to point to next
1528. mov pA, origPA // pA = A
1529. /******************************************************************************/
// 8x1 tiles, plain K loop unrolled by 8.
1530. .Lsgemm_kernel_L1_M8_BEGIN:
1531. mov counterI, origM
1532. asr counterI, counterI, #3
1533. cmp counterI, #0
1534. ble .Lsgemm_kernel_L1_M4_BEGIN
1535. .Lsgemm_kernel_L1_M8_20:
1536. INIT8x1
1537. mov pB, origPB
1538. asr counterL , origK, #3 // counterL = counterL / 8
1539. cmp counterL , #0
1540. ble .Lsgemm_kernel_L1_M8_40
1541. .align 5
1542. .Lsgemm_kernel_L1_M8_22:
1543. KERNEL8x1_SUB
1544. KERNEL8x1_SUB
1545. KERNEL8x1_SUB
1546. KERNEL8x1_SUB
1547. KERNEL8x1_SUB
1548. KERNEL8x1_SUB
1549. KERNEL8x1_SUB
1550. KERNEL8x1_SUB
1551. subs counterL, counterL, #1
1552. bgt .Lsgemm_kernel_L1_M8_22
1553. .Lsgemm_kernel_L1_M8_40:
1554. ands counterL , origK, #7 // counterL = counterL % 8
1555. ble .Lsgemm_kernel_L1_M8_100
1556. .Lsgemm_kernel_L1_M8_42:
1557. KERNEL8x1_SUB
1558. subs counterL, counterL, #1
1559. bgt .Lsgemm_kernel_L1_M8_42
1560. .Lsgemm_kernel_L1_M8_100:
1561. SAVE8x1
1562. .Lsgemm_kernel_L1_M8_END:
1563. subs counterI, counterI, #1
1564. bgt .Lsgemm_kernel_L1_M8_20
1565. /******************************************************************************/
// M remainder: one 4-wide tile.
1566. .Lsgemm_kernel_L1_M4_BEGIN:
1567. mov counterI, origM
1568. tst counterI , #7
1569. ble .Lsgemm_kernel_L1_END
1570. tst counterI, #4
1571. ble .Lsgemm_kernel_L1_M2_BEGIN
1572. .Lsgemm_kernel_L1_M4_20:
1573. INIT4x1
1574. mov pB, origPB
1575. asr counterL , origK, #3 // counterL = counterL / 8
1576. cmp counterL , #0
1577. ble .Lsgemm_kernel_L1_M4_40
1578. .align 5
1579. .Lsgemm_kernel_L1_M4_22:
1580. KERNEL4x1_SUB
1581. KERNEL4x1_SUB
1582. KERNEL4x1_SUB
1583. KERNEL4x1_SUB
1584. KERNEL4x1_SUB
1585. KERNEL4x1_SUB
1586. KERNEL4x1_SUB
1587. KERNEL4x1_SUB
1588. subs counterL, counterL, #1
1589. bgt .Lsgemm_kernel_L1_M4_22
1590. .Lsgemm_kernel_L1_M4_40:
1591. ands counterL , origK, #7 // counterL = counterL % 8
1592. ble .Lsgemm_kernel_L1_M4_100
1593. .Lsgemm_kernel_L1_M4_42:
1594. KERNEL4x1_SUB
1595. subs counterL, counterL, #1
1596. bgt .Lsgemm_kernel_L1_M4_42
1597. .Lsgemm_kernel_L1_M4_100:
1598. SAVE4x1
1599. .Lsgemm_kernel_L1_M4_END:
1600. /******************************************************************************/
// M remainder: one 2-wide tile.
1601. .Lsgemm_kernel_L1_M2_BEGIN:
1602. mov counterI, origM
1603. tst counterI , #3
1604. ble .Lsgemm_kernel_L1_END
1605. tst counterI, #2 // counterI = counterI / 2
1606. ble .Lsgemm_kernel_L1_M1_BEGIN
1607. .Lsgemm_kernel_L1_M2_20:
1608. INIT2x1
1609. mov pB, origPB
1610. asr counterL , origK, #3 // counterL = counterL / 8
1611. cmp counterL , #0
1612. ble .Lsgemm_kernel_L1_M2_40
1613. .Lsgemm_kernel_L1_M2_22:
1614. KERNEL2x1_SUB
1615. KERNEL2x1_SUB
1616. KERNEL2x1_SUB
1617. KERNEL2x1_SUB
1618. KERNEL2x1_SUB
1619. KERNEL2x1_SUB
1620. KERNEL2x1_SUB
1621. KERNEL2x1_SUB
1622. subs counterL, counterL, #1
1623. bgt .Lsgemm_kernel_L1_M2_22
1624. .Lsgemm_kernel_L1_M2_40:
1625. ands counterL , origK, #7 // counterL = counterL % 8
1626. ble .Lsgemm_kernel_L1_M2_100
1627. .Lsgemm_kernel_L1_M2_42:
1628. KERNEL2x1_SUB
1629. subs counterL, counterL, #1
1630. bgt .Lsgemm_kernel_L1_M2_42
1631. .Lsgemm_kernel_L1_M2_100:
1632. SAVE2x1
1633. .Lsgemm_kernel_L1_M2_END:
1634. /******************************************************************************/
// M remainder: final single element (scalar 1x1 dot product).
1635. .Lsgemm_kernel_L1_M1_BEGIN:
1636. tst counterI, #1 // counterI = counterI % 2
1637. ble .Lsgemm_kernel_L1_END
1638. .Lsgemm_kernel_L1_M1_20:
1639. INIT1x1
1640. mov pB, origPB
1641. asr counterL , origK, #3 // counterL = counterL / 8
1642. cmp counterL , #0
1643. ble .Lsgemm_kernel_L1_M1_40
1644. .Lsgemm_kernel_L1_M1_22:
1645. KERNEL1x1_SUB
1646. KERNEL1x1_SUB
1647. KERNEL1x1_SUB
1648. KERNEL1x1_SUB
1649. KERNEL1x1_SUB
1650. KERNEL1x1_SUB
1651. KERNEL1x1_SUB
1652. KERNEL1x1_SUB
1653. subs counterL, counterL, #1
1654. bgt .Lsgemm_kernel_L1_M1_22
1655. .Lsgemm_kernel_L1_M1_40:
1656. ands counterL , origK, #7 // counterL = counterL % 8
1657. ble .Lsgemm_kernel_L1_M1_100
1658. .Lsgemm_kernel_L1_M1_42:
1659. KERNEL1x1_SUB
1660. subs counterL, counterL, #1
1661. bgt .Lsgemm_kernel_L1_M1_42
1662. .Lsgemm_kernel_L1_M1_100:
1663. SAVE1x1
1664. .Lsgemm_kernel_L1_END:
  1665. /******************************************************************************/
// Common exit: restore the registers spilled in the prologue (same
// 176-byte frame, same slot layout) and return 0.
1666. .Lsgemm_kernel_L999:
1667. mov x0, #0 // set return value
1668. ldp d8, d9, [sp, #(0 * 16)]
1669. ldp d10, d11, [sp, #(1 * 16)]
1670. ldp d12, d13, [sp, #(2 * 16)]
1671. ldp d14, d15, [sp, #(3 * 16)]
1672. ldp d16, d17, [sp, #(4 * 16)]
1673. ldp x18, x19, [sp, #(5 * 16)]
1674. ldp x20, x21, [sp, #(6 * 16)]
1675. ldp x22, x23, [sp, #(7 * 16)]
1676. ldp x24, x25, [sp, #(8 * 16)]
1677. ldp x26, x27, [sp, #(9 * 16)]
1678. ldr x28, [sp, #(10 * 16)]
1679. add sp, sp, #(11*16)
1680. ret
1681. EPILOGUE