/* shgemm_kernel_16x8_zvl256b.c — OpenBLAS SHGEMM (fp16 in, fp32 out) GEMM kernel,
 * 16x8 unroll, for RISC-V V with VLEN >= 256 bits. */
  1. #include "common.h"
  2. #include <riscv_vector.h>
  3. int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, FLOAT *C, BLASLONG ldc)
  4. {
  5. BLASLONG gvl = 0;
  6. BLASLONG m_top = 0;
  7. BLASLONG n_top = 0;
  8. // -- MAIN PASS
  9. for (BLASLONG j=0; j<N/8; j+=1) {
  10. m_top = 0;
  11. BLASLONG gvl = __riscv_vsetvl_e16m1(16);
  12. for (BLASLONG i=0; i<M/16; i+=1) {
  13. BLASLONG ai=m_top*K;
  14. BLASLONG bi=n_top*K;
  15. _Float16 B0 = B[bi+0];
  16. _Float16 B1 = B[bi+1];
  17. _Float16 B2 = B[bi+2];
  18. _Float16 B3 = B[bi+3];
  19. _Float16 B4 = B[bi+4];
  20. _Float16 B5 = B[bi+5];
  21. _Float16 B6 = B[bi+6];
  22. _Float16 B7 = B[bi+7];
  23. bi += 8;
  24. vfloat16m1_t A0 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], gvl );
  25. ai += 16;
  26. vfloat32m2_t result0 = __riscv_vfwmul_vf_f32m2( A0, B0, gvl);
  27. vfloat32m2_t result1 = __riscv_vfwmul_vf_f32m2( A0, B1, gvl);
  28. vfloat32m2_t result2 = __riscv_vfwmul_vf_f32m2( A0, B2, gvl);
  29. vfloat32m2_t result3 = __riscv_vfwmul_vf_f32m2( A0, B3, gvl);
  30. vfloat32m2_t result4 = __riscv_vfwmul_vf_f32m2( A0, B4, gvl);
  31. vfloat32m2_t result5 = __riscv_vfwmul_vf_f32m2( A0, B5, gvl);
  32. vfloat32m2_t result6 = __riscv_vfwmul_vf_f32m2( A0, B6, gvl);
  33. vfloat32m2_t result7 = __riscv_vfwmul_vf_f32m2( A0, B7, gvl);
  34. for(BLASLONG k=1; k<K; k++) {
  35. B0 = B[bi+0];
  36. B1 = B[bi+1];
  37. B2 = B[bi+2];
  38. B3 = B[bi+3];
  39. B4 = B[bi+4];
  40. B5 = B[bi+5];
  41. B6 = B[bi+6];
  42. B7 = B[bi+7];
  43. bi += 8;
  44. A0 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], gvl );
  45. ai += 16;
  46. result0 = __riscv_vfwmacc_vf_f32m2(result0, B0, A0, gvl);
  47. result1 = __riscv_vfwmacc_vf_f32m2(result1, B1, A0, gvl);
  48. result2 = __riscv_vfwmacc_vf_f32m2(result2, B2, A0, gvl);
  49. result3 = __riscv_vfwmacc_vf_f32m2(result3, B3, A0, gvl);
  50. result4 = __riscv_vfwmacc_vf_f32m2(result4, B4, A0, gvl);
  51. result5 = __riscv_vfwmacc_vf_f32m2(result5, B5, A0, gvl);
  52. result6 = __riscv_vfwmacc_vf_f32m2(result6, B6, A0, gvl);
  53. result7 = __riscv_vfwmacc_vf_f32m2(result7, B7, A0, gvl);
  54. }
  55. BLASLONG ci=n_top*ldc+m_top;
  56. vfloat32m2_t c0 = __riscv_vle32_v_f32m2( &C[ci], gvl); ci += ldc-gvl*0;
  57. vfloat32m2_t c1 = __riscv_vle32_v_f32m2( &C[ci], gvl); ci += ldc-gvl*0;
  58. vfloat32m2_t c2 = __riscv_vle32_v_f32m2( &C[ci], gvl); ci += ldc-gvl*0;
  59. vfloat32m2_t c3 = __riscv_vle32_v_f32m2( &C[ci], gvl); ci += ldc-gvl*0;
  60. vfloat32m2_t c4 = __riscv_vle32_v_f32m2( &C[ci], gvl); ci += ldc-gvl*0;
  61. vfloat32m2_t c5 = __riscv_vle32_v_f32m2( &C[ci], gvl); ci += ldc-gvl*0;
  62. vfloat32m2_t c6 = __riscv_vle32_v_f32m2( &C[ci], gvl); ci += ldc-gvl*0;
  63. vfloat32m2_t c7 = __riscv_vle32_v_f32m2( &C[ci], gvl);
  64. c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
  65. c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
  66. c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl);
  67. c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl);
  68. c4 = __riscv_vfmacc_vf_f32m2(c4, alpha, result4, gvl);
  69. c5 = __riscv_vfmacc_vf_f32m2(c5, alpha, result5, gvl);
  70. c6 = __riscv_vfmacc_vf_f32m2(c6, alpha, result6, gvl);
  71. c7 = __riscv_vfmacc_vf_f32m2(c7, alpha, result7, gvl);
  72. ci=n_top*ldc+m_top;
  73. __riscv_vse32_v_f32m2( &C[ci], c0, gvl); ci += ldc-gvl*0;
  74. __riscv_vse32_v_f32m2( &C[ci], c1, gvl); ci += ldc-gvl*0;
  75. __riscv_vse32_v_f32m2( &C[ci], c2, gvl); ci += ldc-gvl*0;
  76. __riscv_vse32_v_f32m2( &C[ci], c3, gvl); ci += ldc-gvl*0;
  77. __riscv_vse32_v_f32m2( &C[ci], c4, gvl); ci += ldc-gvl*0;
  78. __riscv_vse32_v_f32m2( &C[ci], c5, gvl); ci += ldc-gvl*0;
  79. __riscv_vse32_v_f32m2( &C[ci], c6, gvl); ci += ldc-gvl*0;
  80. __riscv_vse32_v_f32m2( &C[ci], c7, gvl);
  81. m_top += 16;
  82. }
  83. // -- tails for main pass
  84. if( M & 8 ) {
  85. gvl = __riscv_vsetvl_e16mf2(8);
  86. BLASLONG ai=m_top*K;
  87. BLASLONG bi=n_top*K;
  88. _Float16 B0 = B[bi+0];
  89. _Float16 B1 = B[bi+1];
  90. _Float16 B2 = B[bi+2];
  91. _Float16 B3 = B[bi+3];
  92. _Float16 B4 = B[bi+4];
  93. _Float16 B5 = B[bi+5];
  94. _Float16 B6 = B[bi+6];
  95. _Float16 B7 = B[bi+7];
  96. bi += 8;
  97. vfloat16mf2_t A0 = __riscv_vle16_v_f16mf2( &A[ai+0*gvl], gvl );
  98. ai += 8;
  99. vfloat32m1_t result0 = __riscv_vfwmul_vf_f32m1( A0, B0, gvl);
  100. vfloat32m1_t result1 = __riscv_vfwmul_vf_f32m1( A0, B1, gvl);
  101. vfloat32m1_t result2 = __riscv_vfwmul_vf_f32m1( A0, B2, gvl);
  102. vfloat32m1_t result3 = __riscv_vfwmul_vf_f32m1( A0, B3, gvl);
  103. vfloat32m1_t result4 = __riscv_vfwmul_vf_f32m1( A0, B4, gvl);
  104. vfloat32m1_t result5 = __riscv_vfwmul_vf_f32m1( A0, B5, gvl);
  105. vfloat32m1_t result6 = __riscv_vfwmul_vf_f32m1( A0, B6, gvl);
  106. vfloat32m1_t result7 = __riscv_vfwmul_vf_f32m1( A0, B7, gvl);
  107. for(BLASLONG k=1; k<K; k++) {
  108. B0 = B[bi+0];
  109. B1 = B[bi+1];
  110. B2 = B[bi+2];
  111. B3 = B[bi+3];
  112. B4 = B[bi+4];
  113. B5 = B[bi+5];
  114. B6 = B[bi+6];
  115. B7 = B[bi+7];
  116. bi += 8;
  117. A0 = __riscv_vle16_v_f16mf2( &A[ai+0*gvl], gvl );
  118. ai += 8;
  119. result0 = __riscv_vfwmacc_vf_f32m1(result0, B0, A0, gvl);
  120. result1 = __riscv_vfwmacc_vf_f32m1(result1, B1, A0, gvl);
  121. result2 = __riscv_vfwmacc_vf_f32m1(result2, B2, A0, gvl);
  122. result3 = __riscv_vfwmacc_vf_f32m1(result3, B3, A0, gvl);
  123. result4 = __riscv_vfwmacc_vf_f32m1(result4, B4, A0, gvl);
  124. result5 = __riscv_vfwmacc_vf_f32m1(result5, B5, A0, gvl);
  125. result6 = __riscv_vfwmacc_vf_f32m1(result6, B6, A0, gvl);
  126. result7 = __riscv_vfwmacc_vf_f32m1(result7, B7, A0, gvl);
  127. }
  128. BLASLONG ci=n_top*ldc+m_top;
  129. vfloat32m1_t c0 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
  130. vfloat32m1_t c1 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
  131. vfloat32m1_t c2 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
  132. vfloat32m1_t c3 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
  133. vfloat32m1_t c4 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
  134. vfloat32m1_t c5 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
  135. vfloat32m1_t c6 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
  136. vfloat32m1_t c7 = __riscv_vle32_v_f32m1( &C[ci], gvl);
  137. c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
  138. c1 = __riscv_vfmacc_vf_f32m1(c1, alpha, result1, gvl);
  139. c2 = __riscv_vfmacc_vf_f32m1(c2, alpha, result2, gvl);
  140. c3 = __riscv_vfmacc_vf_f32m1(c3, alpha, result3, gvl);
  141. c4 = __riscv_vfmacc_vf_f32m1(c4, alpha, result4, gvl);
  142. c5 = __riscv_vfmacc_vf_f32m1(c5, alpha, result5, gvl);
  143. c6 = __riscv_vfmacc_vf_f32m1(c6, alpha, result6, gvl);
  144. c7 = __riscv_vfmacc_vf_f32m1(c7, alpha, result7, gvl);
  145. ci=n_top*ldc+m_top;
  146. __riscv_vse32_v_f32m1(&C[ci], c0, gvl); ci += ldc - gvl * 0;
  147. __riscv_vse32_v_f32m1(&C[ci], c1, gvl); ci += ldc - gvl * 0;
  148. __riscv_vse32_v_f32m1(&C[ci], c2, gvl); ci += ldc - gvl * 0;
  149. __riscv_vse32_v_f32m1(&C[ci], c3, gvl); ci += ldc - gvl * 0;
  150. __riscv_vse32_v_f32m1(&C[ci], c4, gvl); ci += ldc - gvl * 0;
  151. __riscv_vse32_v_f32m1(&C[ci], c5, gvl); ci += ldc - gvl * 0;
  152. __riscv_vse32_v_f32m1(&C[ci], c6, gvl); ci += ldc - gvl * 0;
  153. __riscv_vse32_v_f32m1(&C[ci], c7, gvl);
  154. m_top += 8;
  155. }
  156. if( M & 4 ) {
  157. gvl = __riscv_vsetvl_e16mf2(4);
  158. BLASLONG ai=m_top*K;
  159. BLASLONG bi=n_top*K;
  160. _Float16 B0 = B[bi+0];
  161. _Float16 B1 = B[bi+1];
  162. _Float16 B2 = B[bi+2];
  163. _Float16 B3 = B[bi+3];
  164. _Float16 B4 = B[bi+4];
  165. _Float16 B5 = B[bi+5];
  166. _Float16 B6 = B[bi+6];
  167. _Float16 B7 = B[bi+7];
  168. bi += 8;
  169. vfloat16mf2_t A0 = __riscv_vle16_v_f16mf2( &A[ai+0*gvl], gvl );
  170. ai += 4;
  171. vfloat32m1_t result0 = __riscv_vfwmul_vf_f32m1( A0, B0, gvl);
  172. vfloat32m1_t result1 = __riscv_vfwmul_vf_f32m1( A0, B1, gvl);
  173. vfloat32m1_t result2 = __riscv_vfwmul_vf_f32m1( A0, B2, gvl);
  174. vfloat32m1_t result3 = __riscv_vfwmul_vf_f32m1( A0, B3, gvl);
  175. vfloat32m1_t result4 = __riscv_vfwmul_vf_f32m1( A0, B4, gvl);
  176. vfloat32m1_t result5 = __riscv_vfwmul_vf_f32m1( A0, B5, gvl);
  177. vfloat32m1_t result6 = __riscv_vfwmul_vf_f32m1( A0, B6, gvl);
  178. vfloat32m1_t result7 = __riscv_vfwmul_vf_f32m1( A0, B7, gvl);
  179. for(BLASLONG k=1; k < K; ++k) {
  180. B0 = B[bi+0];
  181. B1 = B[bi+1];
  182. B2 = B[bi+2];
  183. B3 = B[bi+3];
  184. B4 = B[bi+4];
  185. B5 = B[bi+5];
  186. B6 = B[bi+6];
  187. B7 = B[bi+7];
  188. bi += 8;
  189. A0 = __riscv_vle16_v_f16mf2( &A[ai+0*gvl], gvl );
  190. ai += 4;
  191. result0 = __riscv_vfwmacc_vf_f32m1(result0, B0, A0, gvl);
  192. result1 = __riscv_vfwmacc_vf_f32m1(result1, B1, A0, gvl);
  193. result2 = __riscv_vfwmacc_vf_f32m1(result2, B2, A0, gvl);
  194. result3 = __riscv_vfwmacc_vf_f32m1(result3, B3, A0, gvl);
  195. result4 = __riscv_vfwmacc_vf_f32m1(result4, B4, A0, gvl);
  196. result5 = __riscv_vfwmacc_vf_f32m1(result5, B5, A0, gvl);
  197. result6 = __riscv_vfwmacc_vf_f32m1(result6, B6, A0, gvl);
  198. result7 = __riscv_vfwmacc_vf_f32m1(result7, B7, A0, gvl);
  199. }
  200. BLASLONG ci = n_top * ldc + m_top;
  201. vfloat32m1_t c0 = __riscv_vle32_v_f32m1(&C[ci], gvl);
  202. ci += ldc - gvl * 0;
  203. vfloat32m1_t c1 = __riscv_vle32_v_f32m1(&C[ci], gvl);
  204. ci += ldc - gvl * 0;
  205. vfloat32m1_t c2 = __riscv_vle32_v_f32m1(&C[ci], gvl);
  206. ci += ldc - gvl * 0;
  207. vfloat32m1_t c3 = __riscv_vle32_v_f32m1(&C[ci], gvl);
  208. ci += ldc - gvl * 0;
  209. vfloat32m1_t c4 = __riscv_vle32_v_f32m1(&C[ci], gvl);
  210. ci += ldc - gvl * 0;
  211. vfloat32m1_t c5 = __riscv_vle32_v_f32m1(&C[ci], gvl);
  212. ci += ldc - gvl * 0;
  213. vfloat32m1_t c6 = __riscv_vle32_v_f32m1(&C[ci], gvl);
  214. ci += ldc - gvl * 0;
  215. vfloat32m1_t c7 = __riscv_vle32_v_f32m1(&C[ci], gvl);
  216. c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
  217. c1 = __riscv_vfmacc_vf_f32m1(c1, alpha, result1, gvl);
  218. c2 = __riscv_vfmacc_vf_f32m1(c2, alpha, result2, gvl);
  219. c3 = __riscv_vfmacc_vf_f32m1(c3, alpha, result3, gvl);
  220. c4 = __riscv_vfmacc_vf_f32m1(c4, alpha, result4, gvl);
  221. c5 = __riscv_vfmacc_vf_f32m1(c5, alpha, result5, gvl);
  222. c6 = __riscv_vfmacc_vf_f32m1(c6, alpha, result6, gvl);
  223. c7 = __riscv_vfmacc_vf_f32m1(c7, alpha, result7, gvl);
  224. ci= n_top * ldc + m_top;
  225. __riscv_vse32_v_f32m1(&C[ci], c0, gvl); ci += ldc - gvl * 0;
  226. __riscv_vse32_v_f32m1(&C[ci], c1, gvl); ci += ldc - gvl * 0;
  227. __riscv_vse32_v_f32m1(&C[ci], c2, gvl); ci += ldc - gvl * 0;
  228. __riscv_vse32_v_f32m1(&C[ci], c3, gvl); ci += ldc - gvl * 0;
  229. __riscv_vse32_v_f32m1(&C[ci], c4, gvl); ci += ldc - gvl * 0;
  230. __riscv_vse32_v_f32m1(&C[ci], c5, gvl); ci += ldc - gvl * 0;
  231. __riscv_vse32_v_f32m1(&C[ci], c6, gvl); ci += ldc - gvl * 0;
  232. __riscv_vse32_v_f32m1(&C[ci], c7, gvl);
  233. m_top += 4;
  234. }
  235. if( M & 2 ) {
  236. float result0 = 0;
  237. float result1 = 0;
  238. float result2 = 0;
  239. float result3 = 0;
  240. float result4 = 0;
  241. float result5 = 0;
  242. float result6 = 0;
  243. float result7 = 0;
  244. float result8 = 0;
  245. float result9 = 0;
  246. float result10 = 0;
  247. float result11 = 0;
  248. float result12 = 0;
  249. float result13 = 0;
  250. float result14 = 0;
  251. float result15 = 0;
  252. BLASLONG ai = m_top * K;
  253. BLASLONG bi = n_top * K;
  254. for(BLASLONG k=0; k<K; k++) {
  255. result0+=(float)(A[ai+0]*B[bi+0]);
  256. result1+=(float)(A[ai+1]*B[bi+0]);
  257. result2+=(float)(A[ai+0]*B[bi+1]);
  258. result3+=(float)(A[ai+1]*B[bi+1]);
  259. result4+=(float)(A[ai+0]*B[bi+2]);
  260. result5+=(float)(A[ai+1]*B[bi+2]);
  261. result6+=(float)(A[ai+0]*B[bi+3]);
  262. result7+=(float)(A[ai+1]*B[bi+3]);
  263. result8+=(float)(A[ai+0]*B[bi+4]);
  264. result9+=(float)(A[ai+1]*B[bi+4]);
  265. result10+=(float)(A[ai+0]*B[bi+5]);
  266. result11+=(float)(A[ai+1]*B[bi+5]);
  267. result12+=(float)(A[ai+0]*B[bi+6]);
  268. result13+=(float)(A[ai+1]*B[bi+6]);
  269. result14+=(float)(A[ai+0]*B[bi+7]);
  270. result15+=(float)(A[ai+1]*B[bi+7]);
  271. ai+=2;
  272. bi+=8;
  273. }
  274. BLASLONG ci=n_top*ldc+m_top;
  275. C[ci + 0 * ldc + 0] += alpha * result0;
  276. C[ci + 0 * ldc + 1] += alpha * result1;
  277. C[ci + 1 * ldc + 0] += alpha * result2;
  278. C[ci + 1 * ldc + 1] += alpha * result3;
  279. C[ci + 2 * ldc + 0] += alpha * result4;
  280. C[ci + 2 * ldc + 1] += alpha * result5;
  281. C[ci + 3 * ldc + 0] += alpha * result6;
  282. C[ci + 3 * ldc + 1] += alpha * result7;
  283. C[ci + 4 * ldc + 0] += alpha * result8;
  284. C[ci + 4 * ldc + 1] += alpha * result9;
  285. C[ci + 5 * ldc + 0] += alpha * result10;
  286. C[ci + 5 * ldc + 1] += alpha * result11;
  287. C[ci + 6 * ldc + 0] += alpha * result12;
  288. C[ci + 6 * ldc + 1] += alpha * result13;
  289. C[ci + 7 * ldc + 0] += alpha * result14;
  290. C[ci + 7 * ldc + 1] += alpha * result15;
  291. m_top+=2;
  292. }
  293. if( M & 1 ) {
  294. float result0 = 0;
  295. float result1 = 0;
  296. float result2 = 0;
  297. float result3 = 0;
  298. float result4 = 0;
  299. float result5 = 0;
  300. float result6 = 0;
  301. float result7 = 0;
  302. BLASLONG ai = m_top * K;
  303. BLASLONG bi = n_top * K;
  304. for(BLASLONG k=0; k<K; k++) {
  305. result0+=(float)(A[ai+0]*B[bi+0]);
  306. result1+=(float)(A[ai+0]*B[bi+1]);
  307. result2+=(float)(A[ai+0]*B[bi+2]);
  308. result3+=(float)(A[ai+0]*B[bi+3]);
  309. result4+=(float)(A[ai+0]*B[bi+4]);
  310. result5+=(float)(A[ai+0]*B[bi+5]);
  311. result6+=(float)(A[ai+0]*B[bi+6]);
  312. result7+=(float)(A[ai+0]*B[bi+7]);
  313. ai+=1;
  314. bi+=8;
  315. }
  316. BLASLONG ci = n_top * ldc + m_top;
  317. C[ci + 0 * ldc + 0] += alpha * result0;
  318. C[ci + 1 * ldc + 0] += alpha * result1;
  319. C[ci + 2 * ldc + 0] += alpha * result2;
  320. C[ci + 3 * ldc + 0] += alpha * result3;
  321. C[ci + 4 * ldc + 0] += alpha * result4;
  322. C[ci + 5 * ldc + 0] += alpha * result5;
  323. C[ci + 6 * ldc + 0] += alpha * result6;
  324. C[ci + 7 * ldc + 0] += alpha * result7;
  325. m_top+=1;
  326. }
  327. n_top += 8;
  328. }
  329. if( N & 4 ) {
  330. gvl = __riscv_vsetvl_e16m1(16);
  331. m_top = 0;
  332. for (BLASLONG i=0; i<M/16; i+=1) {
  333. BLASLONG ai=m_top*K;
  334. BLASLONG bi=n_top*K;
  335. _Float16 B0 = B[bi+0];
  336. _Float16 B1 = B[bi+1];
  337. _Float16 B2 = B[bi+2];
  338. _Float16 B3 = B[bi+3];
  339. bi += 4;
  340. vfloat16m1_t A0 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], gvl );
  341. ai += 16;
  342. vfloat32m2_t result0 = __riscv_vfwmul_vf_f32m2( A0, B0, gvl);
  343. vfloat32m2_t result1 = __riscv_vfwmul_vf_f32m2( A0, B1, gvl);
  344. vfloat32m2_t result2 = __riscv_vfwmul_vf_f32m2( A0, B2, gvl);
  345. vfloat32m2_t result3 = __riscv_vfwmul_vf_f32m2( A0, B3, gvl);
  346. for(BLASLONG k=1; k<K; k++) {
  347. B0 = B[bi+0];
  348. B1 = B[bi+1];
  349. B2 = B[bi+2];
  350. B3 = B[bi+3];
  351. bi += 4;
  352. A0 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], gvl );
  353. ai += 16;
  354. result0 = __riscv_vfwmacc_vf_f32m2(result0, B0, A0, gvl);
  355. result1 = __riscv_vfwmacc_vf_f32m2(result1, B1, A0, gvl);
  356. result2 = __riscv_vfwmacc_vf_f32m2(result2, B2, A0, gvl);
  357. result3 = __riscv_vfwmacc_vf_f32m2(result3, B3, A0, gvl);
  358. }
  359. BLASLONG ci=n_top*ldc+m_top;
  360. vfloat32m2_t c0 = __riscv_vle32_v_f32m2( &C[ci], gvl); ci += ldc-gvl*0;
  361. vfloat32m2_t c1 = __riscv_vle32_v_f32m2( &C[ci], gvl); ci += ldc-gvl*0;
  362. vfloat32m2_t c2 = __riscv_vle32_v_f32m2( &C[ci], gvl); ci += ldc-gvl*0;
  363. vfloat32m2_t c3 = __riscv_vle32_v_f32m2( &C[ci], gvl);
  364. c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
  365. c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
  366. c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl);
  367. c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl);
  368. ci=n_top*ldc+m_top;
  369. __riscv_vse32_v_f32m2( &C[ci], c0, gvl); ci += ldc-gvl*0;
  370. __riscv_vse32_v_f32m2( &C[ci], c1, gvl); ci += ldc-gvl*0;
  371. __riscv_vse32_v_f32m2( &C[ci], c2, gvl); ci += ldc-gvl*0;
  372. __riscv_vse32_v_f32m2( &C[ci], c3, gvl);
  373. m_top += 16;
  374. }
  375. if( M & 8 ) {
  376. gvl = __riscv_vsetvl_e16mf2(8);
  377. BLASLONG ai=m_top*K;
  378. BLASLONG bi=n_top*K;
  379. _Float16 B0 = B[bi+0];
  380. _Float16 B1 = B[bi+1];
  381. _Float16 B2 = B[bi+2];
  382. _Float16 B3 = B[bi+3];
  383. bi += 4;
  384. vfloat16mf2_t A0 = __riscv_vle16_v_f16mf2( &A[ai+0*gvl], gvl );
  385. ai += 8;
  386. vfloat32m1_t result0 = __riscv_vfwmul_vf_f32m1( A0, B0, gvl);
  387. vfloat32m1_t result1 = __riscv_vfwmul_vf_f32m1( A0, B1, gvl);
  388. vfloat32m1_t result2 = __riscv_vfwmul_vf_f32m1( A0, B2, gvl);
  389. vfloat32m1_t result3 = __riscv_vfwmul_vf_f32m1( A0, B3, gvl);
  390. for(BLASLONG k=1; k<K; k++) {
  391. B0 = B[bi+0];
  392. B1 = B[bi+1];
  393. B2 = B[bi+2];
  394. B3 = B[bi+3];
  395. bi += 4;
  396. A0 = __riscv_vle16_v_f16mf2( &A[ai+0*gvl], gvl );
  397. ai += 8;
  398. result0 = __riscv_vfwmacc_vf_f32m1(result0, B0, A0, gvl);
  399. result1 = __riscv_vfwmacc_vf_f32m1(result1, B1, A0, gvl);
  400. result2 = __riscv_vfwmacc_vf_f32m1(result2, B2, A0, gvl);
  401. result3 = __riscv_vfwmacc_vf_f32m1(result3, B3, A0, gvl);
  402. }
  403. BLASLONG ci=n_top*ldc+m_top;
  404. vfloat32m1_t c0 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc - gvl * 0;
  405. vfloat32m1_t c1 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc - gvl * 0;
  406. vfloat32m1_t c2 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc - gvl * 0;
  407. vfloat32m1_t c3 = __riscv_vle32_v_f32m1( &C[ci], gvl);
  408. c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
  409. c1 = __riscv_vfmacc_vf_f32m1(c1, alpha, result1, gvl);
  410. c2 = __riscv_vfmacc_vf_f32m1(c2, alpha, result2, gvl);
  411. c3 = __riscv_vfmacc_vf_f32m1(c3, alpha, result3, gvl);
  412. ci = n_top * ldc + m_top;
  413. __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
  414. __riscv_vse32_v_f32m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
  415. __riscv_vse32_v_f32m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
  416. __riscv_vse32_v_f32m1( &C[ci], c3, gvl);
  417. m_top += 8;
  418. }
  419. if( M & 4 ) {
  420. gvl = __riscv_vsetvl_e16mf2(4);
  421. BLASLONG ai=m_top*K;
  422. BLASLONG bi=n_top*K;
  423. _Float16 B0 = B[bi+0];
  424. _Float16 B1 = B[bi+1];
  425. _Float16 B2 = B[bi+2];
  426. _Float16 B3 = B[bi+3];
  427. bi += 4;
  428. vfloat16mf2_t A0 = __riscv_vle16_v_f16mf2( &A[ai+0*gvl], gvl );
  429. ai += 4;
  430. vfloat32m1_t result0 = __riscv_vfwmul_vf_f32m1( A0, B0, gvl);
  431. vfloat32m1_t result1 = __riscv_vfwmul_vf_f32m1( A0, B1, gvl);
  432. vfloat32m1_t result2 = __riscv_vfwmul_vf_f32m1( A0, B2, gvl);
  433. vfloat32m1_t result3 = __riscv_vfwmul_vf_f32m1( A0, B3, gvl);
  434. for(BLASLONG k=1; k < K; ++k) {
  435. B0 = B[bi+0];
  436. B1 = B[bi+1];
  437. B2 = B[bi+2];
  438. B3 = B[bi+3];
  439. bi += 4;
  440. A0 = __riscv_vle16_v_f16mf2( &A[ai+0*gvl], gvl );
  441. ai += 4;
  442. result0 = __riscv_vfwmacc_vf_f32m1(result0, B0, A0, gvl);
  443. result1 = __riscv_vfwmacc_vf_f32m1(result1, B1, A0, gvl);
  444. result2 = __riscv_vfwmacc_vf_f32m1(result2, B2, A0, gvl);
  445. result3 = __riscv_vfwmacc_vf_f32m1(result3, B3, A0, gvl);
  446. }
  447. BLASLONG ci = n_top * ldc + m_top;
  448. vfloat32m1_t c0 = __riscv_vle32_v_f32m1(&C[ci], gvl);
  449. ci += ldc - gvl * 0;
  450. vfloat32m1_t c1 = __riscv_vle32_v_f32m1(&C[ci], gvl);
  451. ci += ldc - gvl * 0;
  452. vfloat32m1_t c2 = __riscv_vle32_v_f32m1(&C[ci], gvl);
  453. ci += ldc - gvl * 0;
  454. vfloat32m1_t c3 = __riscv_vle32_v_f32m1(&C[ci], gvl);
  455. c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
  456. c1 = __riscv_vfmacc_vf_f32m1(c1, alpha, result1, gvl);
  457. c2 = __riscv_vfmacc_vf_f32m1(c2, alpha, result2, gvl);
  458. c3 = __riscv_vfmacc_vf_f32m1(c3, alpha, result3, gvl);
  459. ci= n_top * ldc + m_top;
  460. __riscv_vse32_v_f32m1(&C[ci], c0, gvl); ci += ldc - gvl * 0;
  461. __riscv_vse32_v_f32m1(&C[ci], c1, gvl); ci += ldc - gvl * 0;
  462. __riscv_vse32_v_f32m1(&C[ci], c2, gvl); ci += ldc - gvl * 0;
  463. __riscv_vse32_v_f32m1(&C[ci], c3, gvl);
  464. m_top += 4;
  465. }
  466. if( M & 2 ) {
  467. float result0 = 0;
  468. float result1 = 0;
  469. float result2 = 0;
  470. float result3 = 0;
  471. float result4 = 0;
  472. float result5 = 0;
  473. float result6 = 0;
  474. float result7 = 0;
  475. BLASLONG ai = m_top * K;
  476. BLASLONG bi = n_top * K;
  477. for(BLASLONG k=0; k<K; k++) {
  478. result0+=(float)(A[ai+0]*B[bi+0]);
  479. result1+=(float)(A[ai+1]*B[bi+0]);
  480. result2+=(float)(A[ai+0]*B[bi+1]);
  481. result3+=(float)(A[ai+1]*B[bi+1]);
  482. result4+=(float)(A[ai+0]*B[bi+2]);
  483. result5+=(float)(A[ai+1]*B[bi+2]);
  484. result6+=(float)(A[ai+0]*B[bi+3]);
  485. result7+=(float)(A[ai+1]*B[bi+3]);
  486. ai+=2;
  487. bi+=4;
  488. }
  489. BLASLONG ci=n_top*ldc+m_top;
  490. C[ci + 0 * ldc + 0] += alpha * result0;
  491. C[ci + 0 * ldc + 1] += alpha * result1;
  492. C[ci + 1 * ldc + 0] += alpha * result2;
  493. C[ci + 1 * ldc + 1] += alpha * result3;
  494. C[ci + 2 * ldc + 0] += alpha * result4;
  495. C[ci + 2 * ldc + 1] += alpha * result5;
  496. C[ci + 3 * ldc + 0] += alpha * result6;
  497. C[ci + 3 * ldc + 1] += alpha * result7;
  498. m_top += 2;
  499. }
  500. if( M & 1 ) {
  501. float result0 = 0;
  502. float result1 = 0;
  503. float result2 = 0;
  504. float result3 = 0;
  505. BLASLONG ai = m_top * K;
  506. BLASLONG bi = n_top * K;
  507. for(BLASLONG k=0; k<K; k++) {
  508. result0+=(float)(A[ai+0]*B[bi+0]);
  509. result1+=(float)(A[ai+0]*B[bi+1]);
  510. result2+=(float)(A[ai+0]*B[bi+2]);
  511. result3+=(float)(A[ai+0]*B[bi+3]);
  512. ai+=1;
  513. bi+=4;
  514. }
  515. BLASLONG ci = n_top * ldc + m_top;
  516. C[ci + 0 * ldc + 0] += alpha * result0;
  517. C[ci + 1 * ldc + 0] += alpha * result1;
  518. C[ci + 2 * ldc + 0] += alpha * result2;
  519. C[ci + 3 * ldc + 0] += alpha * result3;
  520. m_top += 1;
  521. }
  522. n_top += 4;
  523. }
  524. // -- tails for N=2
  525. if( N & 2 ) {
  526. gvl = __riscv_vsetvl_e16m1(16);
  527. m_top = 0;
  528. for (BLASLONG i=0; i<M/16; i+=1) {
  529. BLASLONG ai=m_top*K;
  530. BLASLONG bi=n_top*K;
  531. _Float16 B0 = B[bi+0];
  532. _Float16 B1 = B[bi+1];
  533. bi += 2;
  534. vfloat16m1_t A0 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], gvl );
  535. ai += 16;
  536. vfloat32m2_t result0 = __riscv_vfwmul_vf_f32m2( A0, B0, gvl);
  537. vfloat32m2_t result1 = __riscv_vfwmul_vf_f32m2( A0, B1, gvl);
  538. for(BLASLONG k=1; k<K; k++) {
  539. B0 = B[bi+0];
  540. B1 = B[bi+1];
  541. bi += 2;
  542. A0 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], gvl );
  543. ai += 16;
  544. result0 = __riscv_vfwmacc_vf_f32m2(result0, B0, A0, gvl);
  545. result1 = __riscv_vfwmacc_vf_f32m2(result1, B1, A0, gvl);
  546. }
  547. BLASLONG ci=n_top*ldc+m_top;
  548. vfloat32m2_t c0 = __riscv_vle32_v_f32m2( &C[ci], gvl); ci += ldc-gvl*0;
  549. vfloat32m2_t c1 = __riscv_vle32_v_f32m2( &C[ci], gvl);
  550. c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
  551. c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
  552. ci=n_top*ldc+m_top;
  553. __riscv_vse32_v_f32m2( &C[ci], c0, gvl); ci += ldc-gvl*0;
  554. __riscv_vse32_v_f32m2( &C[ci], c1, gvl);
  555. m_top += 16;
  556. }
  557. if( M & 8 ) {
  558. gvl = __riscv_vsetvl_e16mf2(8);
  559. BLASLONG ai=m_top*K;
  560. BLASLONG bi=n_top*K;
  561. _Float16 B0 = B[bi+0];
  562. _Float16 B1 = B[bi+1];
  563. bi += 2;
  564. vfloat16mf2_t A0 = __riscv_vle16_v_f16mf2( &A[ai+0*gvl], gvl );
  565. ai += 8;
  566. vfloat32m1_t result0 = __riscv_vfwmul_vf_f32m1( A0, B0, gvl);
  567. vfloat32m1_t result1 = __riscv_vfwmul_vf_f32m1( A0, B1, gvl);
  568. for(BLASLONG k=1; k<K; k++) {
  569. B0 = B[bi+0];
  570. B1 = B[bi+1];
  571. bi += 2;
  572. A0 = __riscv_vle16_v_f16mf2( &A[ai+0*gvl], gvl );
  573. ai += 8;
  574. result0 = __riscv_vfwmacc_vf_f32m1(result0, B0, A0, gvl);
  575. result1 = __riscv_vfwmacc_vf_f32m1(result1, B1, A0, gvl);
  576. }
  577. BLASLONG ci=n_top*ldc+m_top;
  578. vfloat32m1_t c0 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc - gvl * 0;
  579. vfloat32m1_t c1 = __riscv_vle32_v_f32m1( &C[ci], gvl);
  580. c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
  581. c1 = __riscv_vfmacc_vf_f32m1(c1, alpha, result1, gvl);
  582. ci = n_top * ldc + m_top;
  583. __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
  584. __riscv_vse32_v_f32m1( &C[ci], c1, gvl);
  585. m_top += 8;
  586. }
  587. if( M & 4 ) {
  588. gvl = __riscv_vsetvl_e16mf2(4);
  589. BLASLONG ai=m_top*K;
  590. BLASLONG bi=n_top*K;
  591. _Float16 B0 = B[bi+0];
  592. _Float16 B1 = B[bi+1];
  593. bi += 2;
  594. vfloat16mf2_t A0 = __riscv_vle16_v_f16mf2( &A[ai+0*gvl], gvl );
  595. ai += 4;
  596. vfloat32m1_t result0 = __riscv_vfwmul_vf_f32m1( A0, B0, gvl);
  597. vfloat32m1_t result1 = __riscv_vfwmul_vf_f32m1( A0, B1, gvl);
  598. for(BLASLONG k=1; k < K; ++k) {
  599. B0 = B[bi+0];
  600. B1 = B[bi+1];
  601. bi += 2;
  602. A0 = __riscv_vle16_v_f16mf2( &A[ai+0*gvl], gvl );
  603. ai += 4;
  604. result0 = __riscv_vfwmacc_vf_f32m1(result0, B0, A0, gvl);
  605. result1 = __riscv_vfwmacc_vf_f32m1(result1, B1, A0, gvl);
  606. }
  607. BLASLONG ci = n_top * ldc + m_top;
  608. vfloat32m1_t c0 = __riscv_vle32_v_f32m1(&C[ci], gvl);
  609. ci += ldc - gvl * 0;
  610. vfloat32m1_t c1 = __riscv_vle32_v_f32m1(&C[ci], gvl);
  611. c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
  612. c1 = __riscv_vfmacc_vf_f32m1(c1, alpha, result1, gvl);
  613. ci= n_top * ldc + m_top;
  614. __riscv_vse32_v_f32m1(&C[ci], c0, gvl); ci += ldc - gvl * 0;
  615. __riscv_vse32_v_f32m1(&C[ci], c1, gvl);
  616. m_top += 4;
  617. }
  618. if( M & 2 ) {
  619. float result0 = 0;
  620. float result1 = 0;
  621. float result2 = 0;
  622. float result3 = 0;
  623. BLASLONG ai = m_top * K;
  624. BLASLONG bi = n_top * K;
  625. for(BLASLONG k=0; k<K; k++) {
  626. result0+=(float)(A[ai+0]*B[bi+0]);
  627. result1+=(float)(A[ai+1]*B[bi+0]);
  628. result2+=(float)(A[ai+0]*B[bi+1]);
  629. result3+=(float)(A[ai+1]*B[bi+1]);
  630. ai+=2;
  631. bi+=2;
  632. }
  633. BLASLONG ci=n_top*ldc+m_top;
  634. C[ci + 0 * ldc + 0] += alpha * result0;
  635. C[ci + 0 * ldc + 1] += alpha * result1;
  636. C[ci + 1 * ldc + 0] += alpha * result2;
  637. C[ci + 1 * ldc + 1] += alpha * result3;
  638. m_top += 2;
  639. }
  640. if( M & 1 ) {
  641. float result0 = 0;
  642. float result1 = 0;
  643. BLASLONG ai = m_top * K;
  644. BLASLONG bi = n_top * K;
  645. for(BLASLONG k=0; k<K; k++) {
  646. result0+=(float)(A[ai+0]*B[bi+0]);
  647. result1+=(float)(A[ai+0]*B[bi+1]);
  648. ai+=1;
  649. bi+=2;
  650. }
  651. BLASLONG ci = n_top * ldc + m_top;
  652. C[ci + 0 * ldc + 0] += alpha * result0;
  653. C[ci + 1 * ldc + 0] += alpha * result1;
  654. m_top += 1;
  655. }
  656. n_top += 2;
  657. }
  658. // -- tails for N=1
  659. if( N & 1 ) {
  660. gvl = __riscv_vsetvl_e16m1(16);
  661. m_top = 0;
  662. for (BLASLONG i=0; i<M/16; i+=1) {
  663. BLASLONG ai=m_top*K;
  664. BLASLONG bi=n_top*K;
  665. _Float16 B0 = B[bi+0];
  666. bi += 1;
  667. vfloat16m1_t A0 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], gvl );
  668. ai += 16;
  669. vfloat32m2_t result0 = __riscv_vfwmul_vf_f32m2( A0, B0, gvl);
  670. for(BLASLONG k=1; k<K; k++) {
  671. B0 = B[bi+0];
  672. bi += 1;
  673. A0 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], gvl );
  674. ai += 16;
  675. result0 = __riscv_vfwmacc_vf_f32m2(result0, B0, A0, gvl);
  676. }
  677. BLASLONG ci=n_top*ldc+m_top;
  678. vfloat32m2_t c0 = __riscv_vle32_v_f32m2( &C[ci], gvl);
  679. c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
  680. ci=n_top*ldc+m_top;
  681. __riscv_vse32_v_f32m2( &C[ci], c0, gvl);
  682. m_top += 16;
  683. }
  684. if( M & 8 ) {
  685. gvl = __riscv_vsetvl_e16mf2(8);
  686. BLASLONG ai=m_top*K;
  687. BLASLONG bi=n_top*K;
  688. _Float16 B0 = B[bi+0];
  689. bi += 1;
  690. vfloat16mf2_t A0 = __riscv_vle16_v_f16mf2( &A[ai+0*gvl], gvl );
  691. ai += 8;
  692. vfloat32m1_t result0 = __riscv_vfwmul_vf_f32m1( A0, B0, gvl);
  693. for(BLASLONG k=1; k<K; k++) {
  694. B0 = B[bi+0];
  695. bi += 1;
  696. A0 = __riscv_vle16_v_f16mf2( &A[ai+0*gvl], gvl );
  697. ai += 8;
  698. result0 = __riscv_vfwmacc_vf_f32m1(result0, B0, A0, gvl);
  699. }
  700. BLASLONG ci=n_top*ldc+m_top;
  701. vfloat32m1_t c0 = __riscv_vle32_v_f32m1( &C[ci], gvl);
  702. c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
  703. ci = n_top * ldc + m_top;
  704. __riscv_vse32_v_f32m1( &C[ci], c0, gvl);
  705. m_top += 8;
  706. }
  707. if( M & 4 ) {
  708. gvl = __riscv_vsetvl_e16mf2(4);
  709. BLASLONG ai=m_top*K;
  710. BLASLONG bi=n_top*K;
  711. _Float16 B0 = B[bi+0];
  712. bi += 1;
  713. vfloat16mf2_t A0 = __riscv_vle16_v_f16mf2( &A[ai+0*gvl], gvl );
  714. ai += 4;
  715. vfloat32m1_t result0 = __riscv_vfwmul_vf_f32m1( A0, B0, gvl);
  716. for(BLASLONG k=1; k < K; ++k) {
  717. B0 = B[bi+0];
  718. bi += 1;
  719. A0 = __riscv_vle16_v_f16mf2( &A[ai+0*gvl], gvl );
  720. ai += 4;
  721. result0 = __riscv_vfwmacc_vf_f32m1(result0, B0, A0, gvl);
  722. }
  723. BLASLONG ci = n_top * ldc + m_top;
  724. vfloat32m1_t c0 = __riscv_vle32_v_f32m1(&C[ci], gvl);
  725. c0 = __riscv_vfmacc_vf_f32m1(c0, alpha, result0, gvl);
  726. ci= n_top * ldc + m_top;
  727. __riscv_vse32_v_f32m1(&C[ci], c0, gvl);
  728. m_top += 4;
  729. }
  730. if( M & 2 ) {
  731. float result0 = 0;
  732. float result1 = 0;
  733. BLASLONG ai = m_top * K;
  734. BLASLONG bi = n_top * K;
  735. for(BLASLONG k=0; k<K; k++) {
  736. result0+=(float)(A[ai+0]*B[bi+0]);
  737. result1+=(float)(A[ai+1]*B[bi+0]);
  738. ai+=2;
  739. bi+=1;
  740. }
  741. BLASLONG ci=n_top*ldc+m_top;
  742. C[ci + 0 * ldc + 0] += alpha * result0;
  743. C[ci + 0 * ldc + 1] += alpha * result1;
  744. m_top += 2;
  745. }
  746. if( M & 1 ) {
  747. float result0 = 0;
  748. BLASLONG ai = m_top * K;
  749. BLASLONG bi = n_top * K;
  750. for(BLASLONG k=0; k<K; k++) {
  751. result0+=(float)(A[ai+0]*B[bi+0]);
  752. ai+=1;
  753. bi+=1;
  754. }
  755. BLASLONG ci = n_top * ldc + m_top;
  756. C[ci + 0 * ldc + 0] += alpha * result0;
  757. m_top += 1;
  758. }
  759. n_top += 1;
  760. }
  761. return 0;
  762. }