You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

dgemm_kernel_8x4_zvl128b.c 15 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492
  1. /*
  2. AUTOGENERATED KERNEL
  3. Script: ./kernel/riscv64/generate_kernel.py
  4. Settings:
  5. LMUL=4
  6. M=8
  7. M_tail_scalar_from=2
  8. N=4
  9. __riscv_='__riscv_'
  10. complex=False
  11. conjugate=False
  12. cpu='zvl128b'
  13. force_acc_double=False
  14. index_type='BLASLONG'
  15. op='gemm'
  16. param_precision='double'
  17. reg_width_bits=128
  18. tail_policy=''
  19. trace=False
  20. Derived:
  21. ELEN_ACC=64
  22. ELEN_PARAM=64
  23. LMUL_ACC=4
  24. VFMACC='__riscv_vfmacc_vf_f64m4'
  25. VFMUL='__riscv_vfmul_vf_f64m4'
  26. VLEV='__riscv_vle64_v_f64m4'
  27. VLSEV='__riscv_vlse64_v_f64m4'
  28. VMACC_TO_ACC='__riscv_vfmacc_vf_f64m4'
  29. VMUL_TO_ACC='__riscv_vfmul_vf_f64m4'
  30. VSETVL='__riscv_vsetvl_e64m4'
  31. VSEV='__riscv_vse64_v_f64m4'
  32. VSSEV='__riscv_vsse64_v_f64m4'
  33. acc_vector_t='vfloat64m4_t'
  34. output='dgemm_kernel_8x4_zvl128b.c'
  35. param_scalar_t='double'
  36. param_vector_t='vfloat64m4_t'
  37. */
  38. #include "common.h"
  39. int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc)
  40. {
  41. BLASLONG gvl = 0;
  42. BLASLONG m_top = 0;
  43. BLASLONG n_top = 0;
  44. // -- MAIN PASS
  45. for (BLASLONG j = 0; j < N / 4; j += 1) {
  46. m_top = 0;
  47. BLASLONG gvl = __riscv_vsetvl_e64m4(8);
  48. for (BLASLONG i = 0; i < M / 8; i += 1) {
  49. BLASLONG ai = m_top * K;
  50. BLASLONG bi = n_top * K;
  51. double B0 = B[bi + 0];
  52. double B1 = B[bi + 1];
  53. double B2 = B[bi + 2];
  54. double B3 = B[bi + 3];
  55. bi += 4;
  56. vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
  57. ai += 8;
  58. vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
  59. vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl);
  60. vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl);
  61. vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl);
  62. for (BLASLONG k = 1; k < K; k++) {
  63. B0 = B[bi + 0];
  64. B1 = B[bi + 1];
  65. B2 = B[bi + 2];
  66. B3 = B[bi + 3];
  67. bi += 4;
  68. A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
  69. ai += 8;
  70. result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
  71. result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl);
  72. result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl);
  73. result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl);
  74. }
  75. BLASLONG ci = n_top * ldc + m_top;
  76. vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl);
  77. ci += ldc - gvl * 0;
  78. vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl);
  79. ci += ldc - gvl * 0;
  80. vfloat64m4_t c2 = __riscv_vle64_v_f64m4(&C[ci], gvl);
  81. ci += ldc - gvl * 0;
  82. vfloat64m4_t c3 = __riscv_vle64_v_f64m4(&C[ci], gvl);
  83. c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl);
  84. c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl);
  85. c2 = __riscv_vfmacc_vf_f64m4(c2, alpha, result2, gvl);
  86. c3 = __riscv_vfmacc_vf_f64m4(c3, alpha, result3, gvl);
  87. ci = n_top * ldc + m_top;
  88. __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
  89. ci += ldc - gvl * 0;
  90. __riscv_vse64_v_f64m4(&C[ci], c1, gvl);
  91. ci += ldc - gvl * 0;
  92. __riscv_vse64_v_f64m4(&C[ci], c2, gvl);
  93. ci += ldc - gvl * 0;
  94. __riscv_vse64_v_f64m4(&C[ci], c3, gvl);
  95. m_top += 8;
  96. }
  97. // -- tails for main pass
  98. if (M & 4) {
  99. gvl = __riscv_vsetvl_e64m4(4);
  100. BLASLONG ai = m_top * K;
  101. BLASLONG bi = n_top * K;
  102. double B0 = B[bi + 0];
  103. double B1 = B[bi + 1];
  104. double B2 = B[bi + 2];
  105. double B3 = B[bi + 3];
  106. bi += 4;
  107. vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
  108. ai += 4;
  109. vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
  110. vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl);
  111. vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl);
  112. vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl);
  113. for (BLASLONG k = 1; k < K; k++) {
  114. B0 = B[bi + 0];
  115. B1 = B[bi + 1];
  116. B2 = B[bi + 2];
  117. B3 = B[bi + 3];
  118. bi += 4;
  119. A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
  120. ai += 4;
  121. result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
  122. result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl);
  123. result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl);
  124. result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl);
  125. }
  126. BLASLONG ci = n_top * ldc + m_top;
  127. vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl);
  128. ci += ldc - gvl * 0;
  129. vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl);
  130. ci += ldc - gvl * 0;
  131. vfloat64m4_t c2 = __riscv_vle64_v_f64m4(&C[ci], gvl);
  132. ci += ldc - gvl * 0;
  133. vfloat64m4_t c3 = __riscv_vle64_v_f64m4(&C[ci], gvl);
  134. c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl);
  135. c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl);
  136. c2 = __riscv_vfmacc_vf_f64m4(c2, alpha, result2, gvl);
  137. c3 = __riscv_vfmacc_vf_f64m4(c3, alpha, result3, gvl);
  138. ci = n_top * ldc + m_top;
  139. __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
  140. ci += ldc - gvl * 0;
  141. __riscv_vse64_v_f64m4(&C[ci], c1, gvl);
  142. ci += ldc - gvl * 0;
  143. __riscv_vse64_v_f64m4(&C[ci], c2, gvl);
  144. ci += ldc - gvl * 0;
  145. __riscv_vse64_v_f64m4(&C[ci], c3, gvl);
  146. m_top += 4;
  147. }
  148. if (M & 2) {
  149. double result0 = 0;
  150. double result1 = 0;
  151. double result2 = 0;
  152. double result3 = 0;
  153. double result4 = 0;
  154. double result5 = 0;
  155. double result6 = 0;
  156. double result7 = 0;
  157. BLASLONG ai = m_top * K;
  158. BLASLONG bi = n_top * K;
  159. for (BLASLONG k = 0; k < K; k++) {
  160. result0 += A[ai + 0] * B[bi + 0];
  161. result1 += A[ai + 1] * B[bi + 0];
  162. result2 += A[ai + 0] * B[bi + 1];
  163. result3 += A[ai + 1] * B[bi + 1];
  164. result4 += A[ai + 0] * B[bi + 2];
  165. result5 += A[ai + 1] * B[bi + 2];
  166. result6 += A[ai + 0] * B[bi + 3];
  167. result7 += A[ai + 1] * B[bi + 3];
  168. ai += 2;
  169. bi += 4;
  170. }
  171. BLASLONG ci = n_top * ldc + m_top;
  172. C[ci + 0 * ldc + 0] += alpha * result0;
  173. C[ci + 0 * ldc + 1] += alpha * result1;
  174. C[ci + 1 * ldc + 0] += alpha * result2;
  175. C[ci + 1 * ldc + 1] += alpha * result3;
  176. C[ci + 2 * ldc + 0] += alpha * result4;
  177. C[ci + 2 * ldc + 1] += alpha * result5;
  178. C[ci + 3 * ldc + 0] += alpha * result6;
  179. C[ci + 3 * ldc + 1] += alpha * result7;
  180. m_top += 2;
  181. }
  182. if (M & 1) {
  183. double result0 = 0;
  184. double result1 = 0;
  185. double result2 = 0;
  186. double result3 = 0;
  187. BLASLONG ai = m_top * K;
  188. BLASLONG bi = n_top * K;
  189. for (BLASLONG k = 0; k < K; k++) {
  190. result0 += A[ai + 0] * B[bi + 0];
  191. result1 += A[ai + 0] * B[bi + 1];
  192. result2 += A[ai + 0] * B[bi + 2];
  193. result3 += A[ai + 0] * B[bi + 3];
  194. ai += 1;
  195. bi += 4;
  196. }
  197. BLASLONG ci = n_top * ldc + m_top;
  198. C[ci + 0 * ldc + 0] += alpha * result0;
  199. C[ci + 1 * ldc + 0] += alpha * result1;
  200. C[ci + 2 * ldc + 0] += alpha * result2;
  201. C[ci + 3 * ldc + 0] += alpha * result3;
  202. m_top += 1;
  203. }
  204. n_top += 4;
  205. }
  206. // -- tails for N=2
  207. if (N & 2) {
  208. gvl = __riscv_vsetvl_e64m4(8);
  209. m_top = 0;
  210. for (BLASLONG i = 0; i < M / 8; i += 1) {
  211. BLASLONG ai = m_top * K;
  212. BLASLONG bi = n_top * K;
  213. double B0 = B[bi + 0];
  214. double B1 = B[bi + 1];
  215. bi += 2;
  216. vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
  217. ai += 8;
  218. vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
  219. vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl);
  220. for (BLASLONG k = 1; k < K; k++) {
  221. B0 = B[bi + 0];
  222. B1 = B[bi + 1];
  223. bi += 2;
  224. A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
  225. ai += 8;
  226. result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
  227. result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl);
  228. }
  229. BLASLONG ci = n_top * ldc + m_top;
  230. vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl);
  231. ci += ldc - gvl * 0;
  232. vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl);
  233. c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl);
  234. c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl);
  235. ci = n_top * ldc + m_top;
  236. __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
  237. ci += ldc - gvl * 0;
  238. __riscv_vse64_v_f64m4(&C[ci], c1, gvl);
  239. m_top += 8;
  240. }
  241. if (M & 4) {
  242. gvl = __riscv_vsetvl_e64m4(4);
  243. BLASLONG ai = m_top * K;
  244. BLASLONG bi = n_top * K;
  245. double B0 = B[bi + 0];
  246. double B1 = B[bi + 1];
  247. bi += 2;
  248. vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
  249. ai += 4;
  250. vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
  251. vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl);
  252. for (BLASLONG k = 1; k < K; k++) {
  253. B0 = B[bi + 0];
  254. B1 = B[bi + 1];
  255. bi += 2;
  256. A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
  257. ai += 4;
  258. result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
  259. result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl);
  260. }
  261. BLASLONG ci = n_top * ldc + m_top;
  262. vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl);
  263. ci += ldc - gvl * 0;
  264. vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl);
  265. c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl);
  266. c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl);
  267. ci = n_top * ldc + m_top;
  268. __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
  269. ci += ldc - gvl * 0;
  270. __riscv_vse64_v_f64m4(&C[ci], c1, gvl);
  271. m_top += 4;
  272. }
  273. if (M & 2) {
  274. double result0 = 0;
  275. double result1 = 0;
  276. double result2 = 0;
  277. double result3 = 0;
  278. BLASLONG ai = m_top * K;
  279. BLASLONG bi = n_top * K;
  280. for (BLASLONG k = 0; k < K; k++) {
  281. result0 += A[ai + 0] * B[bi + 0];
  282. result1 += A[ai + 1] * B[bi + 0];
  283. result2 += A[ai + 0] * B[bi + 1];
  284. result3 += A[ai + 1] * B[bi + 1];
  285. ai += 2;
  286. bi += 2;
  287. }
  288. BLASLONG ci = n_top * ldc + m_top;
  289. C[ci + 0 * ldc + 0] += alpha * result0;
  290. C[ci + 0 * ldc + 1] += alpha * result1;
  291. C[ci + 1 * ldc + 0] += alpha * result2;
  292. C[ci + 1 * ldc + 1] += alpha * result3;
  293. m_top += 2;
  294. }
  295. if (M & 1) {
  296. double result0 = 0;
  297. double result1 = 0;
  298. BLASLONG ai = m_top * K;
  299. BLASLONG bi = n_top * K;
  300. for (BLASLONG k = 0; k < K; k++) {
  301. result0 += A[ai + 0] * B[bi + 0];
  302. result1 += A[ai + 0] * B[bi + 1];
  303. ai += 1;
  304. bi += 2;
  305. }
  306. BLASLONG ci = n_top * ldc + m_top;
  307. C[ci + 0 * ldc + 0] += alpha * result0;
  308. C[ci + 1 * ldc + 0] += alpha * result1;
  309. m_top += 1;
  310. }
  311. n_top += 2;
  312. }
  313. // -- tails for N=1
  314. if (N & 1) {
  315. gvl = __riscv_vsetvl_e64m4(8);
  316. m_top = 0;
  317. for (BLASLONG i = 0; i < M / 8; i += 1) {
  318. BLASLONG ai = m_top * K;
  319. BLASLONG bi = n_top * K;
  320. double B0 = B[bi + 0];
  321. bi += 1;
  322. vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
  323. ai += 8;
  324. vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
  325. for (BLASLONG k = 1; k < K; k++) {
  326. B0 = B[bi + 0];
  327. bi += 1;
  328. A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
  329. ai += 8;
  330. result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
  331. }
  332. BLASLONG ci = n_top * ldc + m_top;
  333. vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl);
  334. c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl);
  335. ci = n_top * ldc + m_top;
  336. __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
  337. m_top += 8;
  338. }
  339. if (M & 4) {
  340. gvl = __riscv_vsetvl_e64m4(4);
  341. BLASLONG ai = m_top * K;
  342. BLASLONG bi = n_top * K;
  343. double B0 = B[bi + 0];
  344. bi += 1;
  345. vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
  346. ai += 4;
  347. vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
  348. for (BLASLONG k = 1; k < K; k++) {
  349. B0 = B[bi + 0];
  350. bi += 1;
  351. A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
  352. ai += 4;
  353. result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
  354. }
  355. BLASLONG ci = n_top * ldc + m_top;
  356. vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl);
  357. c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl);
  358. ci = n_top * ldc + m_top;
  359. __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
  360. m_top += 4;
  361. }
  362. if (M & 2) {
  363. double result0 = 0;
  364. double result1 = 0;
  365. BLASLONG ai = m_top * K;
  366. BLASLONG bi = n_top * K;
  367. for (BLASLONG k = 0; k < K; k++) {
  368. result0 += A[ai + 0] * B[bi + 0];
  369. result1 += A[ai + 1] * B[bi + 0];
  370. ai += 2;
  371. bi += 1;
  372. }
  373. BLASLONG ci = n_top * ldc + m_top;
  374. C[ci + 0 * ldc + 0] += alpha * result0;
  375. C[ci + 0 * ldc + 1] += alpha * result1;
  376. m_top += 2;
  377. }
  378. if (M & 1) {
  379. double result0 = 0;
  380. BLASLONG ai = m_top * K;
  381. BLASLONG bi = n_top * K;
  382. for (BLASLONG k = 0; k < K; k++) {
  383. result0 += A[ai + 0] * B[bi + 0];
  384. ai += 1;
  385. bi += 1;
  386. }
  387. BLASLONG ci = n_top * ldc + m_top;
  388. C[ci + 0 * ldc + 0] += alpha * result0;
  389. m_top += 1;
  390. }
  391. n_top += 1;
  392. }
  393. return 0;
  394. }