
sgemm_small_kernel_tn_power10.c

/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
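/* SGEMM "small matrix" kernel for POWER10, TN variant: C[n*ldc+m] accumulates
   alpha * sum_k A[m*lda+k] * B[n*ldb+k], plus beta * C[n*ldc+m] unless B0 is
   defined.  The hot loops use POWER10 MMA accumulators (__builtin_mma_*) on
   4x4 FP32 tiles, with plain VSX multiply-add for leftover single columns. */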
#include "common.h"
#include <altivec.h>
typedef __vector unsigned char vec_t;
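/* Store macros.  SAVE_RxC_ACC disassembles an MMA accumulator and writes an
   R-row (n direction) by C-column (m direction) tile of C; SAVE_RxC_VSR does
   the same for plain vector results.  In this branch (B0 undefined) the
   existing C values are scaled by beta first; the #else branch further down
   overwrites C with alpha * result only. */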
  30. #if !defined(B0)
  31. #define SAVE_4x4_ACC(ACC, N, M) \
  32. __builtin_mma_disassemble_acc ((void *)result, ACC); \
  33. rc0 = vec_xl(0, C+(N+0)*ldc+M); \
  34. rc0 = vec_mul(rc0, vbeta); \
  35. result[0] = vec_madd(result[0], valpha, rc0); \
  36. vec_xst(result[0], 0, C+(N+0)*ldc+M); \
  37. rc0 = vec_xl(0, C+(N+1)*ldc+M); \
  38. rc0 = vec_mul(rc0, vbeta); \
  39. result[1] = vec_madd(result[1], valpha, rc0); \
  40. vec_xst(result[1], 0, C+(N+1)*ldc+M); \
  41. rc0 = vec_xl(0, C+(N+2)*ldc+M); \
  42. rc0 = vec_mul(rc0, vbeta); \
  43. result[2] = vec_madd(result[2], valpha, rc0); \
  44. vec_xst(result[2], 0, C+(N+2)*ldc+M); \
  45. rc0 = vec_xl(0, C+(N+3)*ldc+M); \
  46. rc0 = vec_mul(rc0, vbeta); \
  47. result[3] = vec_madd(result[3], valpha, rc0); \
  48. vec_xst(result[3], 0, C+(N+3)*ldc+M);
  49. #define SAVE_4x2_ACC(ACC, N, M) \
  50. __builtin_mma_disassemble_acc ((void *)result, ACC); \
  51. rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \
  52. rc0 = vec_mul(rc0, vbeta); \
  53. result[0] = vec_madd(result[0], valpha, rc0); \
  54. vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \
  55. rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \
  56. rc0 = vec_mul(rc0, vbeta); \
  57. result[1] = vec_madd(result[1], valpha, rc0); \
  58. vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \
  59. rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \
  60. rc0 = vec_mul(rc0, vbeta); \
  61. result[2] = vec_madd(result[2], valpha, rc0); \
  62. vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \
  63. rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \
  64. rc0 = vec_mul(rc0, vbeta); \
  65. result[3] = vec_madd(result[3], valpha, rc0); \
  66. vec_xst_len(result[3], C+(N+3)*ldc+M, 8);
  67. #define SAVE_2x4_ACC(ACC, N, M) \
  68. __builtin_mma_disassemble_acc ((void *)result, ACC); \
  69. rc0 = vec_xl(0, C+(N+0)*ldc+M); \
  70. rc0 = vec_mul(rc0, vbeta); \
  71. result[0] = vec_madd(result[0], valpha, rc0); \
  72. vec_xst(result[0], 0, C+(N+0)*ldc+M); \
  73. rc0 = vec_xl(0, C+(N+1)*ldc+M); \
  74. rc0 = vec_mul(rc0, vbeta); \
  75. result[1] = vec_madd(result[1], valpha, rc0); \
  76. vec_xst(result[1], 0, C+(N+1)*ldc+M);
  77. #define SAVE_1x4_VSR(result, N, M) \
  78. rc0 = vec_xl(0, C+((N)*ldc)+M); \
  79. rc0 = vec_mul(rc0, vbeta); \
  80. result = vec_madd(result, valpha, rc0); \
  81. vec_xst(result, 0, C+((N)*ldc)+M);
  82. #define SAVE_2x2_VSR(result, N, M) \
  83. rc0 = vec_xl_len(C+(N*ldc)+M, 8); \
  84. rc0 = vec_insert(C[(N+1)*ldc+M+0], rc0, 2); \
  85. rc0 = vec_insert(C[(N+1)*ldc+M+1], rc0, 3); \
  86. rc0 = vec_mul(rc0, vbeta); \
  87. result = vec_madd(result, valpha, rc0); \
  88. vec_xst_len(result, C+(N*ldc)+M, 8); \
  89. C[(N+1)*ldc+M+0] = result[2]; \
  90. C[(N+1)*ldc+M+1] = result[3];
  91. #define SAVE_1x2_VSR(result, N, M) \
  92. rc0 = vec_xl_len(C+(N*ldc)+M, 8); \
  93. rc0 = vec_mul(rc0, vbeta); \
  94. result = vec_madd(result, valpha, rc0); \
  95. vec_xst_len(result, C+(N*ldc)+M, 8);
  96. #define SAVE_4x1_VSR(result, N, M) \
  97. result = vec_mul(result, valpha); \
  98. C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \
  99. C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; \
  100. C[(N+2)*ldc+M] = (C[(N+2)*ldc+M] * beta) + result[2]; \
  101. C[(N+3)*ldc+M] = (C[(N+3)*ldc+M] * beta) + result[3];
  102. #define SAVE_2x1_VSR(result, N, M) \
  103. result = vec_mul(result, valpha); \
  104. C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \
  105. C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1];
  106. #else
  107. #define SAVE_4x4_ACC(ACC, N, M) \
  108. __builtin_mma_disassemble_acc ((void *)result, ACC); \
  109. result[0] = vec_mul(result[0], valpha); \
  110. vec_xst(result[0], 0, C+(N+0)*ldc+M); \
  111. result[1] = vec_mul(result[1], valpha); \
  112. vec_xst(result[1], 0, C+(N+1)*ldc+M); \
  113. result[2] = vec_mul(result[2], valpha); \
  114. vec_xst(result[2], 0, C+(N+2)*ldc+M); \
  115. result[3] = vec_mul(result[3], valpha); \
  116. vec_xst(result[3], 0, C+(N+3)*ldc+M);
  117. #define SAVE_4x2_ACC(ACC, N, M) \
  118. __builtin_mma_disassemble_acc ((void *)result, ACC); \
  119. result[0] = vec_mul(result[0], valpha); \
  120. vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \
  121. result[1] = vec_mul(result[1], valpha); \
  122. vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \
  123. result[2] = vec_mul(result[2], valpha); \
  124. vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \
  125. result[3] = vec_mul(result[3], valpha); \
  126. vec_xst_len(result[3], C+(N+3)*ldc+M, 8);
  127. #define SAVE_2x4_ACC(ACC, N, M) \
  128. __builtin_mma_disassemble_acc ((void *)result, ACC); \
  129. result[0] = vec_mul(result[0], valpha); \
  130. vec_xst(result[0], 0, C+(N+0)*ldc+M); \
  131. result[1] = vec_mul(result[1], valpha); \
  132. vec_xst(result[1], 0, C+(N+1)*ldc+M);
  133. #define SAVE_1x4_VSR(result, N, M) \
  134. result = vec_mul(result, valpha); \
  135. vec_xst(result, 0, C+((N)*ldc)+M);
  136. #define SAVE_2x2_VSR(result, N, M) \
  137. result = vec_mul(result, valpha); \
  138. vec_xst_len(result, C+(N*ldc)+M, 8); \
  139. C[(N+1)*ldc+M+0] = result[2]; \
  140. C[(N+1)*ldc+M+1] = result[3];
  141. #define SAVE_1x2_VSR(result, N, M) \
  142. result = vec_mul(result, valpha); \
  143. vec_xst_len(result, C+(N*ldc)+M, 8);
  144. #define SAVE_4x1_VSR(result, N, M) \
  145. result = vec_mul(result, valpha); \
  146. C[(N+0)*ldc+M] = result[0]; \
  147. C[(N+1)*ldc+M] = result[1]; \
  148. C[(N+2)*ldc+M] = result[2]; \
  149. C[(N+3)*ldc+M] = result[3];
  150. #define SAVE_2x1_VSR(result, N, M) \
  151. result = vec_mul(result, valpha); \
  152. C[(N+0)*ldc+M] = result[0]; \
  153. C[(N+1)*ldc+M] = result[1];
  154. #endif
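/* Zero the MMA accumulators used by one block (1, 2, 4 or 8 of them). */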
  155. #define INIT_8ACCS() \
  156. __builtin_mma_xxsetaccz(&acc0); \
  157. __builtin_mma_xxsetaccz(&acc1); \
  158. __builtin_mma_xxsetaccz(&acc2); \
  159. __builtin_mma_xxsetaccz(&acc3); \
  160. __builtin_mma_xxsetaccz(&acc4); \
  161. __builtin_mma_xxsetaccz(&acc5); \
  162. __builtin_mma_xxsetaccz(&acc6); \
  163. __builtin_mma_xxsetaccz(&acc7);
  164. #define INIT_4ACCS() \
  165. __builtin_mma_xxsetaccz(&acc0); \
  166. __builtin_mma_xxsetaccz(&acc1); \
  167. __builtin_mma_xxsetaccz(&acc2); \
  168. __builtin_mma_xxsetaccz(&acc3);
  169. #define INIT_2ACCS() \
  170. __builtin_mma_xxsetaccz(&acc0); \
  171. __builtin_mma_xxsetaccz(&acc1);
  172. #define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0);
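/* LOAD_AT_RxC: load an R-row (m) by C-column (k) tile of A starting at (M, K)
   and transpose it in registers, so each resulting vector holds the m values
   belonging to a single k (narrow tiles only use the low lanes).  This is the
   per-k layout the rank-1 MMA updates below consume. */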
  173. #define LOAD_AT_16x4(M, K) \
  174. ra0 = vec_xl(0, A+(M+0)*lda+K); \
  175. ra1 = vec_xl(0, A+(M+1)*lda+K); \
  176. t0 = vec_mergeh(ra0, ra1); \
  177. t1 = vec_mergel(ra0, ra1); \
  178. ra2 = vec_xl(0, A+(M+2)*lda+K); \
  179. ra3 = vec_xl(0, A+(M+3)*lda+K); \
  180. t2 = vec_mergeh(ra2, ra3); \
  181. t3 = vec_mergel(ra2, ra3); \
  182. ra0 = vec_xxpermdi(t0, t2, 0b00); \
  183. ra1 = vec_xxpermdi(t0, t2, 0b11); \
  184. ra2 = vec_xxpermdi(t1, t3, 0b00); \
  185. ra3 = vec_xxpermdi(t1, t3, 0b11); \
  186. ra4 = vec_xl(0, A+(M+4)*lda+K); \
  187. ra5 = vec_xl(0, A+(M+5)*lda+K); \
  188. t0 = vec_mergeh(ra4, ra5); \
  189. t1 = vec_mergel(ra4, ra5); \
  190. ra6 = vec_xl(0, A+(M+6)*lda+K); \
  191. ra7 = vec_xl(0, A+(M+7)*lda+K); \
  192. t2 = vec_mergeh(ra6, ra7); \
  193. t3 = vec_mergel(ra6, ra7); \
  194. ra4 = vec_xxpermdi(t0, t2, 0b00); \
  195. ra5 = vec_xxpermdi(t0, t2, 0b11); \
  196. ra6 = vec_xxpermdi(t1, t3, 0b00); \
  197. ra7 = vec_xxpermdi(t1, t3, 0b11); \
  198. ra8 = vec_xl(0, A+(M+8)*lda+K); \
  199. ra9 = vec_xl(0, A+(M+9)*lda+K); \
  200. t0 = vec_mergeh(ra8, ra9); \
  201. t1 = vec_mergel(ra8, ra9); \
  202. ra10 = vec_xl(0, A+(M+10)*lda+K); \
  203. ra11 = vec_xl(0, A+(M+11)*lda+K); \
  204. t2 = vec_mergeh(ra10, ra11); \
  205. t3 = vec_mergel(ra10, ra11); \
  206. ra8 = vec_xxpermdi(t0, t2, 0b00); \
  207. ra9 = vec_xxpermdi(t0, t2, 0b11); \
  208. ra10 = vec_xxpermdi(t1, t3, 0b00); \
  209. ra11 = vec_xxpermdi(t1, t3, 0b11); \
  210. ra12 = vec_xl(0, A+(M+12)*lda+K); \
  211. ra13 = vec_xl(0, A+(M+13)*lda+K); \
  212. t0 = vec_mergeh(ra12, ra13); \
  213. t1 = vec_mergel(ra12, ra13); \
  214. ra14 = vec_xl(0, A+(M+14)*lda+K); \
  215. ra15 = vec_xl(0, A+(M+15)*lda+K); \
  216. t2 = vec_mergeh(ra14, ra15); \
  217. t3 = vec_mergel(ra14, ra15); \
  218. ra12 = vec_xxpermdi(t0, t2, 0b00); \
  219. ra13 = vec_xxpermdi(t0, t2, 0b11); \
  220. ra14 = vec_xxpermdi(t1, t3, 0b00); \
  221. ra15 = vec_xxpermdi(t1, t3, 0b11);
  222. #define LOAD_AT_16x2(M, K) \
  223. ra0 = vec_xl_len(A+(M+0)*lda+K, 8); \
  224. ra1 = vec_xl_len(A+(M+1)*lda+K, 8); \
  225. t0 = vec_mergeh(ra0, ra1); \
  226. ra2 = vec_xl_len(A+(M+2)*lda+K, 8); \
  227. ra3 = vec_xl_len(A+(M+3)*lda+K, 8); \
  228. t1 = vec_mergeh(ra2, ra3); \
  229. ra0 = vec_xxpermdi(t0, t1, 0b00); \
  230. ra1 = vec_xxpermdi(t0, t1, 0b11); \
  231. ra4 = vec_xl_len(A+(M+4)*lda+K, 8); \
  232. ra5 = vec_xl_len(A+(M+5)*lda+K, 8); \
  233. t0 = vec_mergeh(ra4, ra5); \
  234. ra6 = vec_xl_len(A+(M+6)*lda+K, 8); \
  235. ra7 = vec_xl_len(A+(M+7)*lda+K, 8); \
  236. t1 = vec_mergeh(ra6, ra7); \
  237. ra2 = vec_xxpermdi(t0, t1, 0b00); \
  238. ra3 = vec_xxpermdi(t0, t1, 0b11); \
  239. ra8 = vec_xl_len(A+(M+8)*lda+K, 8); \
  240. ra9 = vec_xl_len(A+(M+9)*lda+K, 8); \
  241. t0 = vec_mergeh(ra8, ra9); \
  242. ra10 = vec_xl_len(A+(M+10)*lda+K, 8); \
  243. ra11 = vec_xl_len(A+(M+11)*lda+K, 8); \
  244. t1 = vec_mergeh(ra10, ra11); \
  245. ra4 = vec_xxpermdi(t0, t1, 0b00); \
  246. ra5 = vec_xxpermdi(t0, t1, 0b11); \
  247. ra12 = vec_xl_len(A+(M+12)*lda+K, 8); \
  248. ra13 = vec_xl_len(A+(M+13)*lda+K, 8); \
  249. t0 = vec_mergeh(ra12, ra13); \
  250. ra14 = vec_xl_len(A+(M+14)*lda+K, 8); \
  251. ra15 = vec_xl_len(A+(M+15)*lda+K, 8); \
  252. t1 = vec_mergeh(ra14, ra15); \
  253. ra6 = vec_xxpermdi(t0, t1, 0b00); \
  254. ra7 = vec_xxpermdi(t0, t1, 0b11);
  255. #define LOAD_AT_16x1(M, K) \
  256. ra0 = vec_xor(ra0, ra0); \
  257. ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \
  258. ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \
  259. ra0 = vec_insert(A[(M+2)*lda+K], ra0, 2); \
  260. ra0 = vec_insert(A[(M+3)*lda+K], ra0, 3); \
  261. ra1 = vec_xor(ra1, ra1); \
  262. ra1 = vec_insert(A[(M+4)*lda+K], ra1, 0); \
  263. ra1 = vec_insert(A[(M+5)*lda+K], ra1, 1); \
  264. ra1 = vec_insert(A[(M+6)*lda+K], ra1, 2); \
  265. ra1 = vec_insert(A[(M+7)*lda+K], ra1, 3); \
  266. ra2 = vec_xor(ra2, ra2); \
  267. ra2 = vec_insert(A[(M+8)*lda+K], ra2, 0); \
  268. ra2 = vec_insert(A[(M+9)*lda+K], ra2, 1); \
  269. ra2 = vec_insert(A[(M+10)*lda+K], ra2, 2); \
  270. ra2 = vec_insert(A[(M+11)*lda+K], ra2, 3); \
  271. ra3 = vec_xor(ra3, ra3); \
  272. ra3 = vec_insert(A[(M+12)*lda+K], ra3, 0); \
  273. ra3 = vec_insert(A[(M+13)*lda+K], ra3, 1); \
  274. ra3 = vec_insert(A[(M+14)*lda+K], ra3, 2); \
  275. ra3 = vec_insert(A[(M+15)*lda+K], ra3, 3);
  276. #define LOAD_AT_8x4(M, K) \
  277. ra0 = vec_xl(0, A+(M+0)*lda+K); \
  278. ra1 = vec_xl(0, A+(M+1)*lda+K); \
  279. t0 = vec_mergeh(ra0, ra1); \
  280. t1 = vec_mergel(ra0, ra1); \
  281. ra2 = vec_xl(0, A+(M+2)*lda+K); \
  282. ra3 = vec_xl(0, A+(M+3)*lda+K); \
  283. t2 = vec_mergeh(ra2, ra3); \
  284. t3 = vec_mergel(ra2, ra3); \
  285. ra0 = vec_xxpermdi(t0, t2, 0b00); \
  286. ra1 = vec_xxpermdi(t0, t2, 0b11); \
  287. ra2 = vec_xxpermdi(t1, t3, 0b00); \
  288. ra3 = vec_xxpermdi(t1, t3, 0b11); \
  289. ra4 = vec_xl(0, A+(M+4)*lda+K); \
  290. ra5 = vec_xl(0, A+(M+5)*lda+K); \
  291. t0 = vec_mergeh(ra4, ra5); \
  292. t1 = vec_mergel(ra4, ra5); \
  293. ra6 = vec_xl(0, A+(M+6)*lda+K); \
  294. ra7 = vec_xl(0, A+(M+7)*lda+K); \
  295. t2 = vec_mergeh(ra6, ra7); \
  296. t3 = vec_mergel(ra6, ra7); \
  297. ra4 = vec_xxpermdi(t0, t2, 0b00); \
  298. ra5 = vec_xxpermdi(t0, t2, 0b11); \
  299. ra6 = vec_xxpermdi(t1, t3, 0b00); \
  300. ra7 = vec_xxpermdi(t1, t3, 0b11);
  301. #define LOAD_AT_8x2(M, K) \
  302. ra0 = vec_xl_len(A+(M+0)*lda+K, 8); \
  303. ra1 = vec_xl_len(A+(M+1)*lda+K, 8); \
  304. t0 = vec_mergeh(ra0, ra1); \
  305. ra2 = vec_xl_len(A+(M+2)*lda+K, 8); \
  306. ra3 = vec_xl_len(A+(M+3)*lda+K, 8); \
  307. t1 = vec_mergeh(ra2, ra3); \
  308. ra0 = vec_xxpermdi(t0, t1, 0b00); \
  309. ra1 = vec_xxpermdi(t0, t1, 0b11); \
  310. ra4 = vec_xl_len(A+(M+4)*lda+K, 8); \
  311. ra5 = vec_xl_len(A+(M+5)*lda+K, 8); \
  312. t0 = vec_mergeh(ra4, ra5); \
  313. ra6 = vec_xl_len(A+(M+6)*lda+K, 8); \
  314. ra7 = vec_xl_len(A+(M+7)*lda+K, 8); \
  315. t1 = vec_mergeh(ra6, ra7); \
  316. ra2 = vec_xxpermdi(t0, t1, 0b00); \
  317. ra3 = vec_xxpermdi(t0, t1, 0b11);
  318. #define LOAD_AT_8x1(M, K) \
  319. ra0 = vec_xor(ra0, ra0); \
  320. ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \
  321. ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \
  322. ra0 = vec_insert(A[(M+2)*lda+K], ra0, 2); \
  323. ra0 = vec_insert(A[(M+3)*lda+K], ra0, 3); \
  324. ra1 = vec_xor(ra1, ra1); \
  325. ra1 = vec_insert(A[(M+4)*lda+K], ra1, 0); \
  326. ra1 = vec_insert(A[(M+5)*lda+K], ra1, 1); \
  327. ra1 = vec_insert(A[(M+6)*lda+K], ra1, 2); \
  328. ra1 = vec_insert(A[(M+7)*lda+K], ra1, 3);
  329. #define LOAD_AT_4x4(M, K) \
  330. ra0 = vec_xl(0, A+(M+0)*lda+K); \
  331. ra1 = vec_xl(0, A+(M+1)*lda+K); \
  332. t0 = vec_mergeh(ra0, ra1); \
  333. t1 = vec_mergel(ra0, ra1); \
  334. ra2 = vec_xl(0, A+(M+2)*lda+K); \
  335. ra3 = vec_xl(0, A+(M+3)*lda+K); \
  336. t2 = vec_mergeh(ra2, ra3); \
  337. t3 = vec_mergel(ra2, ra3); \
  338. ra0 = vec_xxpermdi(t0, t2, 0b00); \
  339. ra1 = vec_xxpermdi(t0, t2, 0b11); \
  340. ra2 = vec_xxpermdi(t1, t3, 0b00); \
  341. ra3 = vec_xxpermdi(t1, t3, 0b11);
  342. #define LOAD_AT_4x2(M, K) \
  343. ra0 = vec_xl_len(A+(M+0)*lda+K, 8); \
  344. ra1 = vec_xl_len(A+(M+1)*lda+K, 8); \
  345. t0 = vec_mergeh(ra0, ra1); \
  346. ra2 = vec_xl_len(A+(M+2)*lda+K, 8); \
  347. ra3 = vec_xl_len(A+(M+3)*lda+K, 8); \
  348. t1 = vec_mergeh(ra2, ra3); \
  349. ra0 = vec_xxpermdi(t0, t1, 0b00); \
  350. ra1 = vec_xxpermdi(t0, t1, 0b11);
  351. #define LOAD_AT_4x1(M, K) \
  352. ra0 = vec_xor(ra0, ra0); \
  353. ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \
  354. ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \
  355. ra0 = vec_insert(A[(M+2)*lda+K], ra0, 2); \
  356. ra0 = vec_insert(A[(M+3)*lda+K], ra0, 3);
  357. #define LOAD_AT_2x4(M, K) \
  358. ra0 = vec_xl(0, A+(M+0)*lda+K); \
  359. ra1 = vec_xl(0, A+(M+1)*lda+K); \
  360. t0 = vec_mergeh(ra0, ra1); \
  361. t1 = vec_mergeo(ra0, ra1); \
  362. t2 = vec_mergel(ra0, ra1); \
  363. ra0 = t0; \
  364. ra1 = t1; \
  365. ra2 = t2; \
  366. ra3 = vec_xor(ra3, ra3); \
  367. ra3 = vec_insert(vec_extract(t2, 2), ra3, 0); \
  368. ra3 = vec_insert(vec_extract(t2, 3), ra3, 1);
  369. #define LOAD_AT_2x2(M, K) \
  370. ra0 = vec_xl_len(A+(M+0)*lda+K, 8); \
  371. ra1 = vec_xl_len(A+(M+1)*lda+K, 8); \
  372. t0 = vec_mergee(ra0, ra1); \
  373. t1 = vec_mergeo(ra0, ra1); \
  374. ra0 = t0; \
  375. ra1 = t1;
  376. #define LOAD_AT_2x1(M, K) \
  377. ra0 = vec_xor(ra0, ra0); \
  378. ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \
  379. ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1);
  380. #define LOAD_A_2x2(M, K) \
  381. ra0 = vec_splats(A[(M+0)*lda+K]); \
  382. ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \
  383. ra0 = vec_insert(A[(M+1)*lda+K], ra0, 3);
  384. #define LOAD_A_2x1(M, K) \
  385. ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \
  386. ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1);
  387. #define LOAD_A_1x1(M, K) ra0 = vec_splats(A[(M)*lda+K]);
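/* LOAD_BT_RxC: the same transposed loads for B, so each vector holds the n
   values of one k.  The LOAD_B_* variants cover the scalar/broadcast edge
   cases. */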
  388. #define LOAD_BT_16x4(N, K) \
  389. rb0 = vec_xl(0, B+(N+0)*ldb+K); \
  390. rb1 = vec_xl(0, B+(N+1)*ldb+K); \
  391. t0 = vec_mergeh(rb0, rb1); \
  392. t1 = vec_mergel(rb0, rb1); \
  393. rb2 = vec_xl(0, B+(N+2)*ldb+K); \
  394. rb3 = vec_xl(0, B+(N+3)*ldb+K); \
  395. t2 = vec_mergeh(rb2, rb3); \
  396. t3 = vec_mergel(rb2, rb3); \
  397. rb0 = vec_xxpermdi(t0, t2, 0b00); \
  398. rb1 = vec_xxpermdi(t0, t2, 0b11); \
  399. rb2 = vec_xxpermdi(t1, t3, 0b00); \
  400. rb3 = vec_xxpermdi(t1, t3, 0b11); \
  401. rb4 = vec_xl(0, B+(N+4)*ldb+K); \
  402. rb5 = vec_xl(0, B+(N+5)*ldb+K); \
  403. t0 = vec_mergeh(rb4, rb5); \
  404. t1 = vec_mergel(rb4, rb5); \
  405. rb6 = vec_xl(0, B+(N+6)*ldb+K); \
  406. rb7 = vec_xl(0, B+(N+7)*ldb+K); \
  407. t2 = vec_mergeh(rb6, rb7); \
  408. t3 = vec_mergel(rb6, rb7); \
  409. rb4 = vec_xxpermdi(t0, t2, 0b00); \
  410. rb5 = vec_xxpermdi(t0, t2, 0b11); \
  411. rb6 = vec_xxpermdi(t1, t3, 0b00); \
  412. rb7 = vec_xxpermdi(t1, t3, 0b11); \
  413. rb8 = vec_xl(0, B+(N+8)*ldb+K); \
  414. rb9 = vec_xl(0, B+(N+9)*ldb+K); \
  415. t0 = vec_mergeh(rb8, rb9); \
  416. t1 = vec_mergel(rb8, rb9); \
  417. rb10 = vec_xl(0, B+(N+10)*ldb+K); \
  418. rb11 = vec_xl(0, B+(N+11)*ldb+K); \
  419. t2 = vec_mergeh(rb10, rb11); \
  420. t3 = vec_mergel(rb10, rb11); \
  421. rb8 = vec_xxpermdi(t0, t2, 0b00); \
  422. rb9 = vec_xxpermdi(t0, t2, 0b11); \
  423. rb10 = vec_xxpermdi(t1, t3, 0b00); \
  424. rb11 = vec_xxpermdi(t1, t3, 0b11); \
  425. rb12 = vec_xl(0, B+(N+12)*ldb+K); \
  426. rb13 = vec_xl(0, B+(N+13)*ldb+K); \
  427. t0 = vec_mergeh(rb12, rb13); \
  428. t1 = vec_mergel(rb12, rb13); \
  429. rb14 = vec_xl(0, B+(N+14)*ldb+K); \
  430. rb15 = vec_xl(0, B+(N+15)*ldb+K); \
  431. t2 = vec_mergeh(rb14, rb15); \
  432. t3 = vec_mergel(rb14, rb15); \
  433. rb12 = vec_xxpermdi(t0, t2, 0b00); \
  434. rb13 = vec_xxpermdi(t0, t2, 0b11); \
  435. rb14 = vec_xxpermdi(t1, t3, 0b00); \
  436. rb15 = vec_xxpermdi(t1, t3, 0b11);
  437. #define LOAD_BT_16x2(N, K) \
  438. rb0 = vec_xl_len(B+(N+0)*ldb+K, 8); \
  439. rb1 = vec_xl_len(B+(N+1)*ldb+K, 8); \
  440. rb2 = vec_xl_len(B+(N+2)*ldb+K, 8); \
  441. rb3 = vec_xl_len(B+(N+3)*ldb+K, 8); \
  442. t0 = vec_mergeh(rb0, rb1); \
  443. t1 = vec_mergeh(rb2, rb3); \
  444. rb0 = vec_xxpermdi(t0, t1, 0b00); \
  445. rb1 = vec_xxpermdi(t0, t1, 0b11); \
  446. rb4 = vec_xl_len(B+(N+4)*ldb+K, 8); \
  447. rb5 = vec_xl_len(B+(N+5)*ldb+K, 8); \
  448. rb6 = vec_xl_len(B+(N+6)*ldb+K, 8); \
  449. rb7 = vec_xl_len(B+(N+7)*ldb+K, 8); \
  450. t0 = vec_mergeh(rb4, rb5); \
  451. t1 = vec_mergeh(rb6, rb7); \
  452. rb2 = vec_xxpermdi(t0, t1, 0b00); \
  453. rb3 = vec_xxpermdi(t0, t1, 0b11); \
  454. rb8 = vec_xl_len(B+(N+8)*ldb+K, 8); \
  455. rb9 = vec_xl_len(B+(N+9)*ldb+K, 8); \
  456. rb10 = vec_xl_len(B+(N+10)*ldb+K, 8); \
  457. rb11 = vec_xl_len(B+(N+11)*ldb+K, 8); \
  458. t0 = vec_mergeh(rb8, rb9); \
  459. t1 = vec_mergeh(rb10, rb11); \
  460. rb4 = vec_xxpermdi(t0, t1, 0b00); \
  461. rb5 = vec_xxpermdi(t0, t1, 0b11); \
  462. rb12 = vec_xl_len(B+(N+12)*ldb+K, 8); \
  463. rb13 = vec_xl_len(B+(N+13)*ldb+K, 8); \
  464. rb14 = vec_xl_len(B+(N+14)*ldb+K, 8); \
  465. rb15 = vec_xl_len(B+(N+15)*ldb+K, 8); \
  466. t0 = vec_mergeh(rb12, rb13); \
  467. t1 = vec_mergeh(rb14, rb15); \
  468. rb6 = vec_xxpermdi(t0, t1, 0b00); \
  469. rb7 = vec_xxpermdi(t0, t1, 0b11);
  470. #define LOAD_BT_16x1(N, K) \
  471. rb0 = vec_xor(rb0, rb0); \
  472. rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \
  473. rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \
  474. rb0 = vec_insert(B[(N+2)*ldb+K], rb0, 2); \
  475. rb0 = vec_insert(B[(N+3)*ldb+K], rb0, 3); \
  476. rb1 = vec_xor(rb1, rb1); \
  477. rb1 = vec_insert(B[(N+4)*ldb+K], rb1, 0); \
  478. rb1 = vec_insert(B[(N+5)*ldb+K], rb1, 1); \
  479. rb1 = vec_insert(B[(N+6)*ldb+K], rb1, 2); \
  480. rb1 = vec_insert(B[(N+7)*ldb+K], rb1, 3); \
  481. rb2 = vec_xor(rb2, rb2); \
  482. rb2 = vec_insert(B[(N+8)*ldb+K], rb2, 0); \
  483. rb2 = vec_insert(B[(N+9)*ldb+K], rb2, 1); \
  484. rb2 = vec_insert(B[(N+10)*ldb+K], rb2, 2); \
  485. rb2 = vec_insert(B[(N+11)*ldb+K], rb2, 3); \
  486. rb3 = vec_xor(rb3, rb3); \
  487. rb3 = vec_insert(B[(N+12)*ldb+K], rb3, 0); \
  488. rb3 = vec_insert(B[(N+13)*ldb+K], rb3, 1); \
  489. rb3 = vec_insert(B[(N+14)*ldb+K], rb3, 2); \
  490. rb3 = vec_insert(B[(N+15)*ldb+K], rb3, 3);
  491. #define LOAD_BT_8x4(N, K) \
  492. rb0 = vec_xl(0, B+(N+0)*ldb+K); \
  493. rb1 = vec_xl(0, B+(N+1)*ldb+K); \
  494. t0 = vec_mergeh(rb0, rb1); \
  495. t1 = vec_mergel(rb0, rb1); \
  496. rb2 = vec_xl(0, B+(N+2)*ldb+K); \
  497. rb3 = vec_xl(0, B+(N+3)*ldb+K); \
  498. t2 = vec_mergeh(rb2, rb3); \
  499. t3 = vec_mergel(rb2, rb3); \
  500. rb0 = vec_xxpermdi(t0, t2, 0b00); \
  501. rb1 = vec_xxpermdi(t0, t2, 0b11); \
  502. rb2 = vec_xxpermdi(t1, t3, 0b00); \
  503. rb3 = vec_xxpermdi(t1, t3, 0b11); \
  504. rb4 = vec_xl(0, B+(N+4)*ldb+K); \
  505. rb5 = vec_xl(0, B+(N+5)*ldb+K); \
  506. t0 = vec_mergeh(rb4, rb5); \
  507. t1 = vec_mergel(rb4, rb5); \
  508. rb6 = vec_xl(0, B+(N+6)*ldb+K); \
  509. rb7 = vec_xl(0, B+(N+7)*ldb+K); \
  510. t2 = vec_mergeh(rb6, rb7); \
  511. t3 = vec_mergel(rb6, rb7); \
  512. rb4 = vec_xxpermdi(t0, t2, 0b00); \
  513. rb5 = vec_xxpermdi(t0, t2, 0b11); \
  514. rb6 = vec_xxpermdi(t1, t3, 0b00); \
  515. rb7 = vec_xxpermdi(t1, t3, 0b11);
  516. #define LOAD_BT_8x2(N, K) \
  517. rb0 = vec_xl_len(B+(N+0)*ldb+K, 8); \
  518. rb1 = vec_xl_len(B+(N+1)*ldb+K, 8); \
  519. t0 = vec_mergeh(rb0, rb1); \
  520. rb2 = vec_xl_len(B+(N+2)*ldb+K, 8); \
  521. rb3 = vec_xl_len(B+(N+3)*ldb+K, 8); \
  522. t1 = vec_mergeh(rb2, rb3); \
  523. rb0 = vec_xxpermdi(t0, t1, 0b00); \
  524. rb1 = vec_xxpermdi(t0, t1, 0b11); \
  525. rb4 = vec_xl_len(B+(N+4)*ldb+K, 8); \
  526. rb5 = vec_xl_len(B+(N+5)*ldb+K, 8); \
  527. t0 = vec_mergeh(rb4, rb5); \
  528. rb6 = vec_xl_len(B+(N+6)*ldb+K, 8); \
  529. rb7 = vec_xl_len(B+(N+7)*ldb+K, 8); \
  530. t1 = vec_mergeh(rb6, rb7); \
  531. rb2 = vec_xxpermdi(t0, t1, 0b00); \
  532. rb3 = vec_xxpermdi(t0, t1, 0b11);
  533. #define LOAD_BT_8x1(N, K) \
  534. rb0 = vec_xor(rb0, rb0); \
  535. rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \
  536. rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \
  537. rb0 = vec_insert(B[(N+2)*ldb+K], rb0, 2); \
  538. rb0 = vec_insert(B[(N+3)*ldb+K], rb0, 3); \
  539. rb1 = vec_xor(rb1, rb1); \
  540. rb1 = vec_insert(B[(N+4)*ldb+K], rb1, 0); \
  541. rb1 = vec_insert(B[(N+5)*ldb+K], rb1, 1); \
  542. rb1 = vec_insert(B[(N+6)*ldb+K], rb1, 2); \
  543. rb1 = vec_insert(B[(N+7)*ldb+K], rb1, 3);
  544. #define LOAD_BT_4x4(N, K) \
  545. rb0 = vec_xl(0, B+(N+0)*ldb+K); \
  546. rb1 = vec_xl(0, B+(N+1)*ldb+K); \
  547. t0 = vec_mergeh(rb0, rb1); \
  548. t1 = vec_mergel(rb0, rb1); \
  549. rb2 = vec_xl(0, B+(N+2)*ldb+K); \
  550. rb3 = vec_xl(0, B+(N+3)*ldb+K); \
  551. t2 = vec_mergeh(rb2, rb3); \
  552. t3 = vec_mergel(rb2, rb3); \
  553. rb0 = vec_xxpermdi(t0, t2, 0b00); \
  554. rb1 = vec_xxpermdi(t0, t2, 0b11); \
  555. rb2 = vec_xxpermdi(t1, t3, 0b00); \
  556. rb3 = vec_xxpermdi(t1, t3, 0b11);
  557. #define LOAD_BT_4x2(N, K) \
  558. rb0 = vec_xl_len(B+(N+0)*ldb+K, 8); \
  559. rb1 = vec_xl_len(B+(N+1)*ldb+K, 8); \
  560. t0 = vec_mergeh(rb0, rb1); \
  561. rb2 = vec_xl_len(B+(N+2)*ldb+K, 8); \
  562. rb3 = vec_xl_len(B+(N+3)*ldb+K, 8); \
  563. t1 = vec_mergeh(rb2, rb3); \
  564. rb0 = vec_xxpermdi(t0, t1, 0b00); \
  565. rb1 = vec_xxpermdi(t0, t1, 0b11);
  566. #define LOAD_BT_4x1(N, K) \
  567. rb0 = vec_xor(rb0, rb0); \
  568. rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \
  569. rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \
  570. rb0 = vec_insert(B[(N+2)*ldb+K], rb0, 2); \
  571. rb0 = vec_insert(B[(N+3)*ldb+K], rb0, 3);
  572. #define LOAD_BT_2x4(N, K) \
  573. rb0 = vec_xl(0, B+(N+0)*ldb+K); \
  574. rb1 = vec_xl(0, B+(N+1)*ldb+K); \
  575. t0 = vec_mergeh(rb0, rb1); \
  576. t1 = vec_mergeo(rb0, rb1); \
  577. t2 = vec_mergel(rb0, rb1); \
  578. rb0 = t0; \
  579. rb1 = t1; \
  580. rb2 = t2; \
  581. rb3 = vec_xor(rb3, rb3); \
  582. rb3 = vec_insert(vec_extract(t2,2), rb3, 0); \
  583. rb3 = vec_insert(vec_extract(t2,3), rb3, 1);
  584. #define LOAD_BT_2x2(N, K) \
  585. rb0 = vec_xl_len(B+(N+0)*ldb+K, 8); \
  586. rb1 = vec_xl_len(B+(N+1)*ldb+K, 8); \
  587. t0 = vec_mergee(rb0, rb1); \
  588. t1 = vec_mergeo(rb0, rb1); \
  589. rb0 = t0; \
  590. rb1 = t1;
  591. #define LOAD_BT_2x1(N, K) \
  592. rb0 = vec_xor(rb0, rb0); \
  593. rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \
  594. rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1);
  595. #define LOAD_B_2x2(N, K) \
  596. rb0 = vec_splats(B[(N+0)*ldb+K]); \
  597. rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 2); \
  598. rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 3);
  599. #define LOAD_B_2x1(N, K) \
  600. rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \
  601. rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1);
  602. #define LOAD_B_1x1(N, K) rb0 = vec_splats(B[(N)*ldb+K]);
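/* One kernel step issues one __builtin_mma_xvf32gerpp per accumulator: a 4x4
   FP32 outer-product accumulate of a B vector (n direction) with an A vector
   (m direction) for one k.  KERNEL_VMADD_* is the plain VSX fallback used when
   only a single column of B remains. */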
  603. #define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \
  604. a0, a1, a2, a3, a4, a5, a6, a7) \
  605. __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \
  606. __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \
  607. __builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \
  608. __builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); \
  609. __builtin_mma_xvf32gerpp(&acc4, (vec_t)b4, (vec_t)a4); \
  610. __builtin_mma_xvf32gerpp(&acc5, (vec_t)b5, (vec_t)a5); \
  611. __builtin_mma_xvf32gerpp(&acc6, (vec_t)b6, (vec_t)a6); \
  612. __builtin_mma_xvf32gerpp(&acc7, (vec_t)b7, (vec_t)a7);
  613. #define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \
  614. __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \
  615. __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \
  616. __builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \
  617. __builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3);
  618. #define KERNEL_MMA_2ACC(b0, b1, a0, a1) \
  619. __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \
  620. __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1);
  621. #define KERNEL_MMA_1ACC(b0, a0) \
  622. __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0);
  623. #define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \
  624. result = vec_madd(a0, b0, result); \
  625. result1 = vec_madd(a1, b1, result1); \
  626. result2 = vec_madd(a2, b2, result2); \
  627. result3 = vec_madd(a3, b3, result3);
  628. #define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \
  629. result = vec_madd(a0, b0, result); \
  630. result1 = vec_madd(a1, b1, result1);
  631. #define KERNEL_VMADD_1VSR(a0, b0) \
  632. result = vec_madd(a0, b0, result);
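/* PACK_A stores the already-transposed 16-float (m) slice of A for one k into
   packA; LOAD_PACKED_A reloads it on later n iterations so the 16-row
   transpose is done only once per k. */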
  633. #define PACK_A(ra0, ra1, ra2, ra3, offset) \
  634. vec_xst(ra0, 0, packA+(k*16)+0+offset); \
  635. vec_xst(ra1, 0, packA+(k*16)+4+offset); \
  636. vec_xst(ra2, 0, packA+(k*16)+8+offset); \
  637. vec_xst(ra3, 0, packA+(k*16)+12+offset);
  638. #define LOAD_PACKED_A(ra0, ra1, ra2, ra3, offset) \
  639. ra0 = vec_xl(0, packA+(k*16)+0+offset); \
  640. ra1 = vec_xl(0, packA+(k*16)+4+offset); \
  641. ra2 = vec_xl(0, packA+(k*16)+8+offset); \
  642. ra3 = vec_xl(0, packA+(k*16)+12+offset);
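/* Entry point.  The B0 build drops the beta argument and overwrites C;
   otherwise C is scaled by beta before the update. */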
  643. #ifdef B0
  644. int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
  645. #else
  646. int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc)
  647. #endif
  648. {
  649. BLASLONG m, n, k;
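/* M, N and K rounded down to multiples of 16/8/4/2: these bound the blocked
   loops below, with smaller tail blocks picking up the remainders. */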
  650. BLASLONG m16 = M & ~15;
  651. BLASLONG m8 = M & ~7;
  652. BLASLONG m4 = M & ~3;
  653. BLASLONG m2 = M & ~1;
  654. BLASLONG n16 = N & ~15;
  655. BLASLONG n8 = N & ~7;
  656. BLASLONG n4 = N & ~3;
  657. BLASLONG n2 = N & ~1;
  658. BLASLONG k4 = K & ~3;
  659. BLASLONG k2 = K & ~1;
  660. vector float valpha = vec_splats(alpha);
  661. #if !defined(B0)
  662. vector float vbeta = vec_splats(beta);
  663. #endif
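/* Pack A only for sufficiently large problems; the packing path is only
   enabled for GCC builds (not clang). */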
  664. #if defined(__GNUC__) && !defined(__clang__)
  665. int has_packing = (M >= 32 && N >= 32 && K >= 32) ? 1 : 0;
  666. #else
  667. int has_packing = 0;
  668. #endif
  669. float *packA;
  670. if (has_packing) packA = (float *)malloc(K*16*sizeof(float));
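/* Main loop: 16x8 blocks of C using eight 4x4 MMA accumulators.  With packing
   enabled, the first n iteration (n == 0) also writes the transposed A tile to
   packA; the remaining n blocks reload it via LOAD_PACKED_A. */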
  671. for (m = 0; m < m16; m += 16) {
  672. for (n = 0; n < n8; n += 8) {
  673. __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
  674. INIT_8ACCS();
  675. register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7, ra8, ra9,
  676. ra10, ra11, ra12, ra13, ra14, ra15;
  677. register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7;
  678. register vector float t0, t1, t2, t3;
  679. if (has_packing) {
  680. if (n == 0) {
  681. for (k = 0; k < k4; k += 4) {
  682. LOAD_AT_16x4(m, k);
  683. LOAD_BT_8x4(n, k);
  684. KERNEL_MMA_8ACC(rb0, rb0, rb4, rb4, rb0, rb0, rb4, rb4,
  685. ra0, ra4, ra0, ra4, ra8, ra12, ra8, ra12);
  686. PACK_A(ra0, ra4, ra8, ra12, 0);
  687. KERNEL_MMA_8ACC(rb1, rb1, rb5, rb5, rb1, rb1, rb5, rb5,
  688. ra1, ra5, ra1, ra5, ra9, ra13, ra9, ra13);
  689. PACK_A(ra1, ra5, ra9, ra13, 16);
  690. KERNEL_MMA_8ACC(rb2, rb2, rb6, rb6, rb2, rb2, rb6, rb6,
  691. ra2, ra6, ra2, ra6, ra10, ra14, ra10, ra14);
  692. PACK_A(ra2, ra6, ra10, ra14, 32);
  693. KERNEL_MMA_8ACC(rb3, rb3, rb7, rb7, rb3, rb3, rb7, rb7,
  694. ra3, ra7, ra3, ra7, ra11, ra15, ra11, ra15);
  695. PACK_A(ra3, ra7, ra11, ra15, 48);
  696. }
  697. for (; k < k2; k += 2) {
  698. LOAD_AT_16x2(m, k);
  699. LOAD_BT_8x2(n, k);
  700. KERNEL_MMA_8ACC(rb0, rb0, rb2, rb2, rb0, rb0, rb2, rb2,
  701. ra0, ra2, ra0, ra2, ra4, ra6, ra4, ra6);
  702. PACK_A(ra0, ra2, ra4, ra6, 0);
  703. KERNEL_MMA_8ACC(rb1, rb1, rb3, rb3, rb1, rb1, rb3, rb3,
  704. ra1, ra3, ra1, ra3, ra5, ra7, ra5, ra7);
  705. PACK_A(ra1, ra3, ra5, ra7, 16);
  706. }
  707. for (; k < K; k++) {
  708. LOAD_AT_16x1(m, k);
  709. LOAD_BT_8x1(n, k);
  710. KERNEL_MMA_8ACC(rb0, rb0, rb1, rb1, rb0, rb0, rb1, rb1,
  711. ra0, ra1, ra0, ra1, ra2, ra3, ra2, ra3);
  712. PACK_A(ra0, ra1, ra2, ra3, 0);
  713. }
  714. } else {
  715. for (k = 0; k < k4; k += 4) {
  716. LOAD_PACKED_A(ra0, ra4, ra8, ra12, 0);
  717. LOAD_BT_8x4(n, k);
  718. KERNEL_MMA_8ACC(rb0, rb0, rb4, rb4, rb0, rb0, rb4, rb4,
  719. ra0, ra4, ra0, ra4, ra8, ra12, ra8, ra12);
  720. LOAD_PACKED_A(ra1, ra5, ra9, ra13, 16);
  721. KERNEL_MMA_8ACC(rb1, rb1, rb5, rb5, rb1, rb1, rb5, rb5,
  722. ra1, ra5, ra1, ra5, ra9, ra13, ra9, ra13);
  723. LOAD_PACKED_A(ra2, ra6, ra10, ra14, 32);
  724. KERNEL_MMA_8ACC(rb2, rb2, rb6, rb6, rb2, rb2, rb6, rb6,
  725. ra2, ra6, ra2, ra6, ra10, ra14, ra10, ra14);
  726. LOAD_PACKED_A(ra3, ra7, ra11, ra15, 48);
  727. KERNEL_MMA_8ACC(rb3, rb3, rb7, rb7, rb3, rb3, rb7, rb7,
  728. ra3, ra7, ra3, ra7, ra11, ra15, ra11, ra15);
  729. }
  730. for (; k < k2; k += 2) {
  731. LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0);
  732. LOAD_BT_8x2(n, k);
  733. KERNEL_MMA_8ACC(rb0, rb0, rb2, rb2, rb0, rb0, rb2, rb2,
  734. ra0, ra2, ra0, ra2, ra4, ra6, ra4, ra6);
  735. LOAD_PACKED_A(ra1, ra3, ra5, ra7, 16);
  736. KERNEL_MMA_8ACC(rb1, rb1, rb3, rb3, rb1, rb1, rb3, rb3,
  737. ra1, ra3, ra1, ra3, ra5, ra7, ra5, ra7);
  738. }
  739. for (; k < K; k++) {
  740. LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0);
  741. LOAD_BT_8x1(n, k);
  742. KERNEL_MMA_8ACC(rb0, rb0, rb1, rb1, rb0, rb0, rb1, rb1,
  743. ra0, ra1, ra0, ra1, ra2, ra3, ra2, ra3);
  744. }
  745. }
  746. } else {
  747. for (k = 0; k < k4; k += 4) {
  748. LOAD_AT_16x4(m, k);
  749. LOAD_BT_8x4(n, k);
  750. KERNEL_MMA_8ACC(rb0, rb0, rb4, rb4, rb0, rb0, rb4, rb4,
  751. ra0, ra4, ra0, ra4, ra8, ra12, ra8, ra12);
  752. KERNEL_MMA_8ACC(rb1, rb1, rb5, rb5, rb1, rb1, rb5, rb5,
  753. ra1, ra5, ra1, ra5, ra9, ra13, ra9, ra13);
  754. KERNEL_MMA_8ACC(rb2, rb2, rb6, rb6, rb2, rb2, rb6, rb6,
  755. ra2, ra6, ra2, ra6, ra10, ra14, ra10, ra14);
  756. KERNEL_MMA_8ACC(rb3, rb3, rb7, rb7, rb3, rb3, rb7, rb7,
  757. ra3, ra7, ra3, ra7, ra11, ra15, ra11, ra15);
  758. }
  759. for (; k < k2; k += 2) {
  760. LOAD_AT_16x2(m, k);
  761. LOAD_BT_8x2(n, k);
  762. KERNEL_MMA_8ACC(rb0, rb0, rb2, rb2, rb0, rb0, rb2, rb2,
  763. ra0, ra2, ra0, ra2, ra4, ra6, ra4, ra6);
  764. KERNEL_MMA_8ACC(rb1, rb1, rb3, rb3, rb1, rb1, rb3, rb3,
  765. ra1, ra3, ra1, ra3, ra5, ra7, ra5, ra7);
  766. }
  767. for (; k < K; k++) {
  768. LOAD_AT_16x1(m, k);
  769. LOAD_BT_8x1(n, k);
  770. KERNEL_MMA_8ACC(rb0, rb0, rb1, rb1, rb0, rb0, rb1, rb1,
  771. ra0, ra1, ra0, ra1, ra2, ra3, ra2, ra3);
  772. }
  773. }
  774. #if !defined(B0)
  775. register vector float rc0;
  776. #endif
  777. vector float result[4];
  778. SAVE_4x4_ACC(&acc0, n+0, m+0);
  779. SAVE_4x4_ACC(&acc1, n+0, m+4);
  780. SAVE_4x4_ACC(&acc4, n+0, m+8);
  781. SAVE_4x4_ACC(&acc5, n+0, m+12);
  782. SAVE_4x4_ACC(&acc2, n+4, m+0);
  783. SAVE_4x4_ACC(&acc3, n+4, m+4);
  784. SAVE_4x4_ACC(&acc6, n+4, m+8);
  785. SAVE_4x4_ACC(&acc7, n+4, m+12);
  786. }
  787. for (; n < n4; n += 4) {
  788. __vector_quad acc0, acc1, acc2, acc3;
  789. INIT_4ACCS();
  790. register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7, ra8, ra9,
  791. ra10, ra11, ra12, ra13, ra14, ra15;
  792. register vector float rb0, rb1, rb2, rb3;
  793. register vector float t0, t1, t2, t3;
  794. if (!has_packing) {
  795. for (k = 0; k < k4; k += 4) {
  796. LOAD_AT_16x4(m, k);
  797. LOAD_BT_4x4(n, k);
  798. KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra4, ra8, ra12);
  799. KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra5, ra9, ra13);
  800. KERNEL_MMA_4ACC(rb2, rb2, rb2, rb2, ra2, ra6, ra10, ra14);
  801. KERNEL_MMA_4ACC(rb3, rb3, rb3, rb3, ra3, ra7, ra11, ra15);
  802. }
  803. for (; k < k2; k += 2) {
  804. LOAD_AT_16x2(m, k);
  805. LOAD_BT_4x2(n, k);
  806. KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra2, ra4, ra6);
  807. KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra3, ra5, ra7);
  808. }
  809. for (; k < K; k++) {
  810. LOAD_AT_16x1(m, k);
  811. LOAD_BT_4x1(n, k);
  812. KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3);
  813. }
  814. } else {
  815. for (k = 0; k < k4; k += 4) {
  816. LOAD_PACKED_A(ra0, ra4, ra8, ra12, 0);
  817. LOAD_BT_4x4(n, k);
  818. KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra4, ra8, ra12);
  819. LOAD_PACKED_A(ra1, ra5, ra9, ra13, 16);
  820. KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra5, ra9, ra13);
  821. LOAD_PACKED_A(ra2, ra6, ra10, ra14, 32);
  822. KERNEL_MMA_4ACC(rb2, rb2, rb2, rb2, ra2, ra6, ra10, ra14);
  823. LOAD_PACKED_A(ra3, ra7, ra11, ra15, 48);
  824. KERNEL_MMA_4ACC(rb3, rb3, rb3, rb3, ra3, ra7, ra11, ra15);
  825. }
  826. for (; k < k2; k += 2) {
  827. LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0);
  828. LOAD_BT_4x2(n, k);
  829. KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra2, ra4, ra6);
  830. LOAD_PACKED_A(ra1, ra3, ra5, ra7, 16);
  831. KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra3, ra5, ra7);
  832. }
  833. for (; k < K; k++) {
  834. LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0);
  835. LOAD_BT_4x1(n, k);
  836. KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3);
  837. }
  838. }
  839. #if !defined(B0)
  840. register vector float rc0;
  841. #endif
  842. vector float result[4];
  843. SAVE_4x4_ACC(&acc0, n+0, m+0);
  844. SAVE_4x4_ACC(&acc1, n+0, m+4);
  845. SAVE_4x4_ACC(&acc2, n+0, m+8);
  846. SAVE_4x4_ACC(&acc3, n+0, m+12);
  847. }
  848. for (; n < n2; n += 2) {
  849. __vector_quad acc0, acc1, acc2, acc3;
  850. INIT_4ACCS();
  851. register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7, ra8, ra9,
  852. ra10, ra11, ra12, ra13, ra14, ra15;
  853. register vector float rb0, rb1, rb2, rb3;
  854. register vector float t0, t1, t2, t3;
  855. if (!has_packing) {
  856. for (k = 0; k < k4; k += 4) {
  857. LOAD_AT_16x4(m, k);
  858. LOAD_BT_2x4(n, k);
  859. KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra4, ra8, ra12);
  860. KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra5, ra9, ra13);
  861. KERNEL_MMA_4ACC(rb2, rb2, rb2, rb2, ra2, ra6, ra10, ra14);
  862. KERNEL_MMA_4ACC(rb3, rb3, rb3, rb3, ra3, ra7, ra11, ra15);
  863. }
  864. for (; k < k2; k += 2) {
  865. LOAD_AT_16x2(m, k);
  866. LOAD_BT_2x2(n, k);
  867. KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra2, ra4, ra6);
  868. KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra3, ra5, ra7);
  869. }
  870. for (; k < K; k++) {
  871. LOAD_AT_16x1(m, k);
  872. LOAD_BT_2x1(n, k);
  873. KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3);
  874. }
  875. } else {
  876. for (k = 0; k < k4; k += 4) {
  877. LOAD_PACKED_A(ra0, ra4, ra8, ra12, 0);
  878. LOAD_BT_2x4(n, k);
  879. KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra4, ra8, ra12);
  880. LOAD_PACKED_A(ra1, ra5, ra9, ra13, 16);
  881. KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra5, ra9, ra13);
  882. LOAD_PACKED_A(ra2, ra6, ra10, ra14, 32);
  883. KERNEL_MMA_4ACC(rb2, rb2, rb2, rb2, ra2, ra6, ra10, ra14);
  884. LOAD_PACKED_A(ra3, ra7, ra11, ra15, 48);
  885. KERNEL_MMA_4ACC(rb3, rb3, rb3, rb3, ra3, ra7, ra11, ra15);
  886. }
  887. for (; k < k2; k += 2) {
  888. LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0);
  889. LOAD_BT_2x2(n, k);
  890. KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra2, ra4, ra6);
  891. LOAD_PACKED_A(ra1, ra3, ra5, ra7, 16);
  892. KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra3, ra5, ra7);
  893. }
  894. for (; k < K; k++) {
  895. LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0);
  896. LOAD_BT_2x1(n, k);
  897. KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3);
  898. }
  899. }
  900. #if !defined(B0)
  901. register vector float rc0;
  902. #endif
  903. vector float result[4];
  904. SAVE_2x4_ACC(&acc0, n+0, m+0);
  905. SAVE_2x4_ACC(&acc1, n+0, m+4);
  906. SAVE_2x4_ACC(&acc2, n+0, m+8);
  907. SAVE_2x4_ACC(&acc3, n+0, m+12);
  908. }
  909. for (; n < N; n++) {
  910. register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7, ra8, ra9,
  911. ra10, ra11, ra12, ra13, ra14, ra15;
  912. register vector float rb0;
  913. register vector float t0, t1, t2, t3;
  914. vector float result = ((vector float){0.,0.,0.,0.});
  915. vector float result1 = ((vector float){0.,0.,0.,0.});
  916. vector float result2 = ((vector float){0.,0.,0.,0.});
  917. vector float result3 = ((vector float){0.,0.,0.,0.});
  918. if (!has_packing) {
  919. for (k = 0; k < k4; k += 4) {
  920. LOAD_AT_16x4(m, k);
  921. LOAD_B_1x1(n, k);
  922. KERNEL_VMADD_4VSR(ra0, ra4, ra8, ra12, rb0, rb0, rb0, rb0);
  923. LOAD_B_1x1(n, k+1);
  924. KERNEL_VMADD_4VSR(ra1, ra5, ra9, ra13, rb0, rb0, rb0, rb0);
  925. LOAD_B_1x1(n, k+2);
  926. KERNEL_VMADD_4VSR(ra2, ra6, ra10, ra14, rb0, rb0, rb0, rb0);
  927. LOAD_B_1x1(n, k+3);
  928. KERNEL_VMADD_4VSR(ra3, ra7, ra11, ra15, rb0, rb0, rb0, rb0);
  929. }
  930. for (; k < k2; k += 2) {
  931. LOAD_AT_16x2(m, k);
  932. LOAD_B_1x1(n, k);
  933. KERNEL_VMADD_4VSR(ra0, ra2, ra4, ra6, rb0, rb0, rb0, rb0);
  934. LOAD_B_1x1(n, k+1);
  935. KERNEL_VMADD_4VSR(ra1, ra3, ra5, ra7, rb0, rb0, rb0, rb0);
  936. }
  937. for (; k < K; k++) {
  938. LOAD_AT_16x1(m, k);
  939. LOAD_B_1x1(n, k);
  940. KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0);
  941. }
  942. } else {
  943. for (k = 0; k < k4; k += 4) {
  944. LOAD_PACKED_A(ra0, ra4, ra8, ra12, 0);
  945. LOAD_B_1x1(n, k);
  946. KERNEL_VMADD_4VSR(ra0, ra4, ra8, ra12, rb0, rb0, rb0, rb0);
  947. LOAD_PACKED_A(ra1, ra5, ra9, ra13, 16);
  948. LOAD_B_1x1(n, k+1);
  949. KERNEL_VMADD_4VSR(ra1, ra5, ra9, ra13, rb0, rb0, rb0, rb0);
  950. LOAD_PACKED_A(ra2, ra6, ra10, ra14, 32);
  951. LOAD_B_1x1(n, k+2);
  952. KERNEL_VMADD_4VSR(ra2, ra6, ra10, ra14, rb0, rb0, rb0, rb0);
  953. LOAD_PACKED_A(ra3, ra7, ra11, ra15, 48);
  954. LOAD_B_1x1(n, k+3);
  955. KERNEL_VMADD_4VSR(ra3, ra7, ra11, ra15, rb0, rb0, rb0, rb0);
  956. }
  957. for (; k < k2; k += 2) {
  958. LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0);
  959. LOAD_B_1x1(n, k);
  960. KERNEL_VMADD_4VSR(ra0, ra2, ra4, ra6, rb0, rb0, rb0, rb0);
  961. LOAD_PACKED_A(ra1, ra3, ra5, ra7, 16);
  962. LOAD_B_1x1(n, k+1);
  963. KERNEL_VMADD_4VSR(ra1, ra3, ra5, ra7, rb0, rb0, rb0, rb0);
  964. }
  965. for (; k < K; k++) {
  966. LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0);
  967. LOAD_B_1x1(n, k);
  968. KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0);
  969. }
  970. }
  971. #if !defined(B0)
  972. register vector float rc0;
  973. #endif
  974. SAVE_1x4_VSR(result, n, m+0);
  975. SAVE_1x4_VSR(result1, n, m+4);
  976. SAVE_1x4_VSR(result2, n, m+8);
  977. SAVE_1x4_VSR(result3, n, m+12);
  978. }
  979. }
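/* Leftover m handled in blocks of 8 (then 4 and 2 below); these tails do not
   use packA. */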
  980. for (; m < m8; m += 8) {
  981. for (n = 0; n < n16; n += 16) {
  982. __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
  983. INIT_8ACCS();
  984. register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7;
  985. register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7, rb8, rb9,
  986. rb10, rb11, rb12, rb13, rb14, rb15;
  987. register vector float t0, t1, t2, t3;
  988. for (k = 0; k < k4; k += 4) {
  989. LOAD_AT_8x4(m, k);
  990. LOAD_BT_16x4(n, k);
  991. KERNEL_MMA_8ACC(rb0, rb0, rb4, rb4, rb8, rb8, rb12, rb12,
  992. ra0, ra4, ra0, ra4, ra0, ra4, ra0, ra4);
  993. KERNEL_MMA_8ACC(rb1, rb1, rb5, rb5, rb9, rb9, rb13, rb13,
  994. ra1, ra5, ra1, ra5, ra1, ra5, ra1, ra5);
  995. KERNEL_MMA_8ACC(rb2, rb2, rb6, rb6, rb10, rb10, rb14, rb14,
  996. ra2, ra6, ra2, ra6, ra2, ra6, ra2, ra6);
  997. KERNEL_MMA_8ACC(rb3, rb3, rb7, rb7, rb11, rb11, rb15, rb15,
  998. ra3, ra7, ra3, ra7, ra3, ra7, ra3, ra7);
  999. }
  1000. for (; k < k2; k += 2) {
  1001. LOAD_AT_8x2(m, k);
  1002. LOAD_BT_16x2(n, k);
  1003. KERNEL_MMA_8ACC(rb0, rb0, rb2, rb2, rb4, rb4, rb6, rb6,
  1004. ra0, ra2, ra0, ra2, ra0, ra2, ra0, ra2);
  1005. KERNEL_MMA_8ACC(rb1, rb1, rb3, rb3, rb5, rb5, rb7, rb7,
  1006. ra1, ra3, ra1, ra3, ra1, ra3, ra1, ra3);
  1007. }
  1008. for (; k < K; k++) {
  1009. LOAD_AT_8x1(m, k);
  1010. LOAD_BT_16x1(n, k);
  1011. KERNEL_MMA_8ACC(rb0, rb0, rb1, rb1, rb2, rb2, rb3, rb3,
  1012. ra0, ra1, ra0, ra1, ra0, ra1, ra0, ra1);
  1013. }
  1014. #if !defined(B0)
  1015. register vector float rc0;
  1016. #endif
  1017. vector float result[4];
  1018. SAVE_4x4_ACC(&acc0, n+0, m+0);
  1019. SAVE_4x4_ACC(&acc1, n+0, m+4);
  1020. SAVE_4x4_ACC(&acc2, n+4, m+0);
  1021. SAVE_4x4_ACC(&acc3, n+4, m+4);
  1022. SAVE_4x4_ACC(&acc4, n+8, m+0);
  1023. SAVE_4x4_ACC(&acc5, n+8, m+4);
  1024. SAVE_4x4_ACC(&acc6, n+12, m+0);
  1025. SAVE_4x4_ACC(&acc7, n+12, m+4);
  1026. }
  1027. for (; n < n8; n += 8) {
  1028. __vector_quad acc0, acc1, acc2, acc3;
  1029. INIT_4ACCS();
  1030. register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7;
  1031. register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7;
  1032. register vector float t0, t1, t2, t3;
  1033. for (k = 0; k < k4; k += 4) {
  1034. LOAD_AT_8x4(m, k);
  1035. LOAD_BT_8x4(n, k);
  1036. KERNEL_MMA_4ACC(rb0, rb0, rb4, rb4, ra0, ra4, ra0, ra4);
  1037. KERNEL_MMA_4ACC(rb1, rb1, rb5, rb5, ra1, ra5, ra1, ra5);
  1038. KERNEL_MMA_4ACC(rb2, rb2, rb6, rb6, ra2, ra6, ra2, ra6);
  1039. KERNEL_MMA_4ACC(rb3, rb3, rb7, rb7, ra3, ra7, ra3, ra7);
  1040. }
  1041. for (; k < k2; k += 2) {
  1042. LOAD_AT_8x2(m, k);
  1043. LOAD_BT_8x2(n, k);
  1044. KERNEL_MMA_4ACC(rb0, rb0, rb2, rb2, ra0, ra2, ra0, ra2);
  1045. KERNEL_MMA_4ACC(rb1, rb1, rb3, rb3, ra1, ra3, ra1, ra3);
  1046. }
  1047. for (; k < K; k++) {
  1048. LOAD_AT_8x1(m, k);
  1049. LOAD_BT_8x1(n, k);
  1050. KERNEL_MMA_4ACC(rb0, rb0, rb1, rb1, ra0, ra1, ra0, ra1);
  1051. }
  1052. #if !defined(B0)
  1053. register vector float rc0;
  1054. #endif
  1055. vector float result[4];
  1056. SAVE_4x4_ACC(&acc0, n+0, m+0);
  1057. SAVE_4x4_ACC(&acc1, n+0, m+4);
  1058. SAVE_4x4_ACC(&acc2, n+4, m+0);
  1059. SAVE_4x4_ACC(&acc3, n+4, m+4);
  1060. }
  1061. for (; n < n4; n += 4) {
  1062. __vector_quad acc0, acc1;
  1063. INIT_2ACCS();
  1064. register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7;
  1065. register vector float rb0, rb1, rb2, rb3;
  1066. register vector float t0, t1, t2, t3;
  1067. for (k = 0; k < k4; k += 4) {
  1068. LOAD_AT_8x4(m, k);
  1069. LOAD_BT_4x4(n, k);
  1070. KERNEL_MMA_2ACC(rb0, rb0, ra0, ra4);
  1071. KERNEL_MMA_2ACC(rb1, rb1, ra1, ra5);
  1072. KERNEL_MMA_2ACC(rb2, rb2, ra2, ra6);
  1073. KERNEL_MMA_2ACC(rb3, rb3, ra3, ra7);
  1074. }
  1075. for (; k < k2; k += 2) {
  1076. LOAD_AT_8x2(m, k);
  1077. LOAD_BT_4x2(n, k);
  1078. KERNEL_MMA_2ACC(rb0, rb0, ra0, ra2);
  1079. KERNEL_MMA_2ACC(rb1, rb1, ra1, ra3);
  1080. }
  1081. for (; k < K; k++) {
  1082. LOAD_AT_8x1(m, k);
  1083. LOAD_BT_4x1(n, k);
  1084. KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1);
  1085. }
  1086. #if !defined(B0)
  1087. register vector float rc0;
  1088. #endif
  1089. vector float result[4];
  1090. SAVE_4x4_ACC(&acc0, n+0, m+0);
  1091. SAVE_4x4_ACC(&acc1, n+0, m+4);
  1092. }
  1093. for (; n < n2; n += 2) {
  1094. __vector_quad acc0, acc1;
  1095. INIT_2ACCS();
  1096. register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7;
  1097. register vector float rb0, rb1, rb2, rb3;
  1098. register vector float t0, t1, t2, t3;
  1099. for (k = 0; k < k4; k += 4) {
  1100. LOAD_AT_8x4(m, k);
  1101. LOAD_BT_2x4(n, k);
  1102. KERNEL_MMA_2ACC(rb0, rb0, ra0, ra4);
  1103. KERNEL_MMA_2ACC(rb1, rb1, ra1, ra5);
  1104. KERNEL_MMA_2ACC(rb2, rb2, ra2, ra6);
  1105. KERNEL_MMA_2ACC(rb3, rb3, ra3, ra7);
  1106. }
  1107. for (; k < k2; k += 2) {
  1108. LOAD_AT_8x2(m, k);
  1109. LOAD_BT_2x2(n, k);
  1110. KERNEL_MMA_2ACC(rb0, rb0, ra0, ra2);
  1111. KERNEL_MMA_2ACC(rb1, rb1, ra1, ra3);
  1112. }
  1113. for (; k < K; k++) {
  1114. LOAD_AT_8x1(m, k);
  1115. LOAD_BT_2x1(n, k);
  1116. KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1);
  1117. }
  1118. #if !defined(B0)
  1119. register vector float rc0;
  1120. #endif
  1121. vector float result[4];
  1122. SAVE_2x4_ACC(&acc0, n, m+0);
  1123. SAVE_2x4_ACC(&acc1, n, m+4);
  1124. }
  1125. for (; n < N; n++) {
  1126. register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7;
  1127. register vector float rb0;
  1128. register vector float t0, t1, t2, t3;
  1129. vector float result = ((vector float){0.,0.,0.,0.});
  1130. vector float result1 = ((vector float){0.,0.,0.,0.});
  1131. for (k = 0; k < k4; k += 4) {
  1132. LOAD_AT_8x4(m, k);
  1133. LOAD_B_1x1(n, k);
  1134. KERNEL_VMADD_2VSR(ra0, ra4, rb0, rb0);
  1135. LOAD_B_1x1(n, k+1);
  1136. KERNEL_VMADD_2VSR(ra1, ra5, rb0, rb0);
  1137. LOAD_B_1x1(n, k+2);
  1138. KERNEL_VMADD_2VSR(ra2, ra6, rb0, rb0);
  1139. LOAD_B_1x1(n, k+3);
  1140. KERNEL_VMADD_2VSR(ra3, ra7, rb0, rb0);
  1141. }
  1142. for (; k < k2; k += 2) {
  1143. LOAD_AT_8x2(m, k);
  1144. LOAD_B_1x1(n, k);
  1145. KERNEL_VMADD_2VSR(ra0, ra2, rb0, rb0);
  1146. LOAD_B_1x1(n, k+1);
  1147. KERNEL_VMADD_2VSR(ra1, ra3, rb0, rb0);
  1148. }
  1149. for (; k < K; k++) {
  1150. LOAD_AT_8x1(m, k);
  1151. LOAD_B_1x1(n, k);
  1152. KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0);
  1153. }
  1154. #if !defined(B0)
  1155. register vector float rc0;
  1156. #endif
  1157. SAVE_1x4_VSR(result, n, m);
  1158. SAVE_1x4_VSR(result1, n, m+4);
  1159. }
  1160. }
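/* Leftover m in blocks of 4. */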
  1161. for (; m < m4; m += 4) {
  1162. for (n = 0; n < n16; n += 16) {
  1163. __vector_quad acc0, acc1, acc2, acc3;
  1164. INIT_4ACCS();
  1165. register vector float ra0, ra1, ra2, ra3;
  1166. register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7, rb8, rb9,
  1167. rb10, rb11, rb12, rb13, rb14, rb15;
  1168. register vector float t0, t1, t2, t3;
  1169. for (k = 0; k < k4; k += 4) {
  1170. LOAD_AT_4x4(m, k);
  1171. LOAD_BT_16x4(n, k);
  1172. KERNEL_MMA_4ACC(rb0, rb4, rb8, rb12, ra0, ra0, ra0, ra0);
  1173. KERNEL_MMA_4ACC(rb1, rb5, rb9, rb13, ra1, ra1, ra1, ra1);
  1174. KERNEL_MMA_4ACC(rb2, rb6, rb10, rb14, ra2, ra2, ra2, ra2);
  1175. KERNEL_MMA_4ACC(rb3, rb7, rb11, rb15, ra3, ra3, ra3, ra3);
  1176. }
  1177. for (; k < k2; k += 2) {
  1178. LOAD_AT_4x2(m, k);
  1179. LOAD_BT_16x2(n, k);
  1180. KERNEL_MMA_4ACC(rb0, rb2, rb4, rb6, ra0, ra0, ra0, ra0);
  1181. KERNEL_MMA_4ACC(rb1, rb3, rb5, rb7, ra1, ra1, ra1, ra1);
  1182. }
  1183. for (; k < K; k++) {
  1184. LOAD_AT_4x1(m, k);
  1185. LOAD_BT_16x1(n, k);
  1186. KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0);
  1187. }
  1188. #if !defined(B0)
  1189. register vector float rc0;
  1190. #endif
  1191. vector float result[4];
  1192. SAVE_4x4_ACC(&acc0, n+0, m+0);
  1193. SAVE_4x4_ACC(&acc1, n+4, m+0);
  1194. SAVE_4x4_ACC(&acc2, n+8, m+0);
  1195. SAVE_4x4_ACC(&acc3, n+12, m+0);
  1196. }
  1197. for (; n < n8; n += 8) {
  1198. __vector_quad acc0, acc1;
  1199. INIT_2ACCS();
  1200. register vector float ra0, ra1, ra2, ra3;
  1201. register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7;
  1202. register vector float t0, t1, t2, t3;
  1203. for (k = 0; k < k4; k += 4) {
  1204. LOAD_AT_4x4(m, k);
  1205. LOAD_BT_8x4(n, k);
  1206. KERNEL_MMA_2ACC(rb0, rb4, ra0, ra0);
  1207. KERNEL_MMA_2ACC(rb1, rb5, ra1, ra1);
  1208. KERNEL_MMA_2ACC(rb2, rb6, ra2, ra2);
  1209. KERNEL_MMA_2ACC(rb3, rb7, ra3, ra3);
  1210. }
  1211. for (; k < k2; k += 2) {
  1212. LOAD_AT_4x2(m, k);
  1213. LOAD_BT_8x2(n, k);
  1214. KERNEL_MMA_2ACC(rb0, rb2, ra0, ra0);
  1215. KERNEL_MMA_2ACC(rb1, rb3, ra1, ra1);
  1216. }
  1217. for (; k < K; k++) {
  1218. LOAD_AT_4x1(m, k);
  1219. LOAD_BT_8x1(n, k);
  1220. KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0);
  1221. }
  1222. #if !defined(B0)
  1223. register vector float rc0;
  1224. #endif
  1225. vector float result[4];
  1226. SAVE_4x4_ACC(&acc0, n+0, m+0);
  1227. SAVE_4x4_ACC(&acc1, n+4, m+0);
  1228. }
    for (; n < n4; n += 4) {
      __vector_quad acc0;
      INIT_1ACC();
      register vector float ra0, ra1, ra2, ra3;
      register vector float rb0, rb1, rb2, rb3;
      register vector float t0, t1, t2, t3;
      for (k = 0; k < k4; k += 4) {
        LOAD_AT_4x4(m, k);
        LOAD_BT_4x4(n, k);
        KERNEL_MMA_1ACC(rb0, ra0);
        KERNEL_MMA_1ACC(rb1, ra1);
        KERNEL_MMA_1ACC(rb2, ra2);
        KERNEL_MMA_1ACC(rb3, ra3);
      }
      for (; k < k2; k += 2) {
        LOAD_AT_4x2(m, k);
        LOAD_BT_4x2(n, k);
        KERNEL_MMA_1ACC(rb0, ra0);
        KERNEL_MMA_1ACC(rb1, ra1);
      }
      for (; k < K; k++) {
        LOAD_AT_4x1(m, k);
        LOAD_BT_4x1(n, k);
        KERNEL_MMA_1ACC(rb0, ra0);
      }
#if !defined(B0)
      register vector float rc0;
#endif
      vector float result[4];
      SAVE_4x4_ACC(&acc0, n, m);
    }
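    /* N tail: 2 remaining columns; only a 2x4 block of the accumulator is
       stored back (SAVE_2x4_ACC). */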
    for (; n < n2; n += 2) {
      __vector_quad acc0;
      INIT_1ACC();
      register vector float ra0, ra1, ra2, ra3;
      register vector float rb0, rb1, rb2, rb3;
      register vector float t0, t1, t2, t3;
      for (k = 0; k < k4; k += 4) {
        LOAD_AT_4x4(m, k);
        LOAD_BT_2x4(n, k);
        KERNEL_MMA_1ACC(rb0, ra0);
        KERNEL_MMA_1ACC(rb1, ra1);
        KERNEL_MMA_1ACC(rb2, ra2);
        KERNEL_MMA_1ACC(rb3, ra3);
      }
      for (; k < k2; k += 2) {
        LOAD_AT_4x2(m, k);
        LOAD_BT_2x2(n, k);
        KERNEL_MMA_1ACC(rb0, ra0);
        KERNEL_MMA_1ACC(rb1, ra1);
      }
      for (; k < K; k++) {
        LOAD_AT_4x1(m, k);
        LOAD_BT_2x1(n, k);
        KERNEL_MMA_1ACC(rb0, ra0);
      }
#if !defined(B0)
      register vector float rc0;
#endif
      vector float result[4];
      SAVE_2x4_ACC(&acc0, n, m);
    }
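    /* Last odd column: plain VSX multiply-add over the 4-row panel of A^T,
       consuming B one element per step (LOAD_B_1x1). */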
    for (; n < N; n++) {
      register vector float ra0, ra1, ra2, ra3;
      register vector float rb0;
      register vector float t0, t1, t2, t3;
      vector float result = ((vector float){0.,0.,0.,0.});
      for (k = 0; k < k4; k += 4) {
        LOAD_AT_4x4(m, k);
        LOAD_B_1x1(n, k);
        KERNEL_VMADD_1VSR(ra0, rb0);
        LOAD_B_1x1(n, k+1);
        KERNEL_VMADD_1VSR(ra1, rb0);
        LOAD_B_1x1(n, k+2);
        KERNEL_VMADD_1VSR(ra2, rb0);
        LOAD_B_1x1(n, k+3);
        KERNEL_VMADD_1VSR(ra3, rb0);
      }
      for (; k < k2; k += 2) {
        LOAD_AT_4x2(m, k);
        LOAD_B_1x1(n, k);
        KERNEL_VMADD_1VSR(ra0, rb0);
        LOAD_B_1x1(n, k+1);
        KERNEL_VMADD_1VSR(ra1, rb0);
      }
      for (; k < K; k++) {
        LOAD_AT_4x1(m, k);
        LOAD_B_1x1(n, k);
        KERNEL_VMADD_1VSR(ra0, rb0);
      }
#if !defined(B0)
      register vector float rc0;
#endif
      SAVE_1x4_VSR(result, n, m);
    }
  }
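  /*
   * M tail: 2 rows of A remain.  The same N blocking is repeated with
   * 2-row loads; MMA is still used down to 4-column tiles, then plain
   * VSX multiply-add finishes the last one or two columns.
   */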
  for (; m < m2; m += 2) {
    for (n = 0; n < n8; n += 8) {
      __vector_quad acc0, acc1;
      INIT_2ACCS();
      register vector float ra0, ra1, ra2, ra3;
      register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7;
      register vector float t0, t1, t2, t3;
      for (k = 0; k < k4; k += 4) {
        LOAD_AT_2x4(m, k);
        LOAD_BT_8x4(n, k);
        KERNEL_MMA_2ACC(rb0, rb4, ra0, ra0);
        KERNEL_MMA_2ACC(rb1, rb5, ra1, ra1);
        KERNEL_MMA_2ACC(rb2, rb6, ra2, ra2);
        KERNEL_MMA_2ACC(rb3, rb7, ra3, ra3);
      }
      for (; k < k2; k += 2) {
        LOAD_AT_2x2(m, k);
        LOAD_BT_8x2(n, k);
        KERNEL_MMA_2ACC(rb0, rb2, ra0, ra0);
        KERNEL_MMA_2ACC(rb1, rb3, ra1, ra1);
      }
      for (; k < K; k++) {
        LOAD_AT_2x1(m, k);
        LOAD_BT_8x1(n, k);
        KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0);
      }
#if !defined(B0)
      register vector float rc0;
#endif
      vector float result[4];
      SAVE_4x2_ACC(&acc0, n+0, m+0);
      SAVE_4x2_ACC(&acc1, n+4, m+0);
    }
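    /* N tail: 4 columns by 2 rows in a single MMA accumulator. */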
    for (; n < n4; n += 4) {
      __vector_quad acc0;
      INIT_1ACC();
      register vector float ra0, ra1, ra2, ra3;
      register vector float rb0, rb1, rb2, rb3;
      register vector float t0, t1, t2, t3;
      for (k = 0; k < k4; k += 4) {
        LOAD_AT_2x4(m, k);
        LOAD_BT_4x4(n, k);
        KERNEL_MMA_1ACC(rb0, ra0);
        KERNEL_MMA_1ACC(rb1, ra1);
        KERNEL_MMA_1ACC(rb2, ra2);
        KERNEL_MMA_1ACC(rb3, ra3);
      }
      for (; k < k2; k += 2) {
        LOAD_AT_2x2(m, k);
        LOAD_BT_4x2(n, k);
        KERNEL_MMA_1ACC(rb0, ra0);
        KERNEL_MMA_1ACC(rb1, ra1);
      }
      for (; k < K; k++) {
        LOAD_AT_2x1(m, k);
        LOAD_BT_4x1(n, k);
        KERNEL_MMA_1ACC(rb0, ra0);
      }
#if !defined(B0)
      register vector float rc0;
#endif
      vector float result[4];
      SAVE_4x2_ACC(&acc0, n, m);
    }
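    /* N tail: the last two columns and the last odd column fall back to
       plain VSX multiply-add over the full K loop. */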
    for (; n < n2; n += 2) {
      vector float result = ((vector float){0.,0.,0.,0.});
      register vector float ra0;
      register vector float rb0;
      for (k = 0; k < K; k++) {
        LOAD_A_2x2(m, k);
        LOAD_B_2x2(n, k);
        KERNEL_VMADD_1VSR(ra0, rb0);
      }
#if !defined(B0)
      register vector float rc0;
#endif
      SAVE_2x2_VSR(result, n, m);
    }
    for (; n < N; n++) {
      vector float result = ((vector float){0.,0.,0.,0.});
      register vector float ra0 = ((vector float){0.,0.,0.,0.});
      register vector float rb0;
      for (k = 0; k < K; k++) {
        LOAD_A_2x1(m, k);
        LOAD_B_1x1(n, k);
        KERNEL_VMADD_1VSR(ra0, rb0);
      }
#if !defined(B0)
      register vector float rc0;
#endif
      SAVE_1x2_VSR(result, n, m);
    }
  }
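  /*
   * M tail: a single remaining row of A.  B is still consumed in
   * transposed 8- and 4-column panels where possible; the final columns
   * are finished with a scalar loop.
   */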
  for (; m < M; m++) {
    for (n = 0; n < n8; n += 8) {
      vector float result = ((vector float){0.,0.,0.,0.});
      vector float result1 = ((vector float){0.,0.,0.,0.});
      register vector float ra0;
      register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7;
      register vector float t0, t1, t2, t3;
      for (k = 0; k < k4; k += 4) {
        LOAD_A_1x1(m, k);
        LOAD_BT_8x4(n, k);
        KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb4);
        LOAD_A_1x1(m, k+1);
        KERNEL_VMADD_2VSR(ra0, ra0, rb1, rb5);
        LOAD_A_1x1(m, k+2);
        KERNEL_VMADD_2VSR(ra0, ra0, rb2, rb6);
        LOAD_A_1x1(m, k+3);
        KERNEL_VMADD_2VSR(ra0, ra0, rb3, rb7);
      }
      for (; k < k2; k += 2) {
        LOAD_A_1x1(m, k);
        LOAD_BT_8x2(n, k);
        KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb2);
        LOAD_A_1x1(m, k+1);
        KERNEL_VMADD_2VSR(ra0, ra0, rb1, rb3);
      }
      for (; k < K; k++) {
        LOAD_A_1x1(m, k);
        LOAD_BT_8x1(n, k);
        KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1);
      }
      SAVE_4x1_VSR(result, n, m);
      SAVE_4x1_VSR(result1, n+4, m);
    }
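    /* N tail: 4 columns of B against the single remaining row of A. */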
    for (; n < n4; n += 4) {
      vector float result = ((vector float){0.,0.,0.,0.});
      register vector float ra0;
      register vector float rb0, rb1, rb2, rb3;
      register vector float t0, t1, t2, t3;
      for (k = 0; k < k4; k += 4) {
        LOAD_A_1x1(m, k);
        LOAD_BT_4x4(n, k);
        KERNEL_VMADD_1VSR(ra0, rb0);
        LOAD_A_1x1(m, k+1);
        KERNEL_VMADD_1VSR(ra0, rb1);
        LOAD_A_1x1(m, k+2);
        KERNEL_VMADD_1VSR(ra0, rb2);
        LOAD_A_1x1(m, k+3);
        KERNEL_VMADD_1VSR(ra0, rb3);
      }
      for (; k < k2; k += 2) {
        LOAD_A_1x1(m, k);
        LOAD_BT_4x2(n, k);
        KERNEL_VMADD_1VSR(ra0, rb0);
        LOAD_A_1x1(m, k+1);
        KERNEL_VMADD_1VSR(ra0, rb1);
      }
      for (; k < K; k++) {
        LOAD_A_1x1(m, k);
        LOAD_BT_4x1(n, k);
        KERNEL_VMADD_1VSR(ra0, rb0);
      }
      SAVE_4x1_VSR(result, n, m);
    }
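    /* N tail: 2 columns accumulated in one vector; SAVE_2x1_VSR writes back
       the two valid results. */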
    for (; n < n2; n += 2) {
      vector float result = ((vector float){0.,0.,0.,0.});
      register vector float ra0;
      register vector float rb0 = ((vector float){0.,0.,0.,0.});
      for (k = 0; k < K; k++) {
        LOAD_A_1x1(m, k);
        LOAD_B_2x1(n, k);
        KERNEL_VMADD_1VSR(ra0, rb0);
      }
      SAVE_2x1_VSR(result, n, m);
    }
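    /* Last odd column: scalar dot product over K, scaled by alpha (and
       combined with beta*C unless B0 is defined). */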
    for (; n < N; n++) {
      FLOAT result = 0.0f;
      for (k = 0; k < K; k++) {
        result += A[m*lda+k] * B[n*ldb+k];
      }
      result = result * alpha;
#if !defined(B0)
      C[n*ldc+m] = (C[n*ldc+m] * beta) + result;
#else
      C[n*ldc+m] = result;
#endif
    }
  }
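  /* Free the temporary packing buffer for A, if one was allocated. */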
  if (has_packing) free(packA);
  return 0;
}
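
/*
 * Reference semantics of this kernel, as a plain-C sketch for checking the
 * vector paths above (the entry-point signature and the FLOAT typedef are
 * defined elsewhere in OpenBLAS and are not repeated here).  With B0
 * defined, the beta term is dropped:
 *
 *   for (m = 0; m < M; m++)
 *     for (n = 0; n < N; n++) {
 *       FLOAT s = 0.0f;
 *       for (k = 0; k < K; k++)
 *         s += A[m*lda + k] * B[n*ldb + k];   // A is accessed transposed (TN)
 *       C[n*ldc + m] = alpha * s + beta * C[n*ldc + m];
 *     }
 */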