You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

zgemm_kernel_4x4_msa.c 60 kB


  1. /*******************************************************************************
  2. Copyright (c) 2016, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #include "common.h"
  28. #include "macros_msa.h"
  /*
   * ZGEMM_KERNEL_4X4_MSA(OP0, OP1, OP2, OP3, OP4)
   *
   * One k-step of the 4x4 tile. Hands pa0 and pb0 to LD_DP4_INC to fill
   * src_a0..src_a3 and src_b0..src_b3, then updates the eight accumulator
   * pairs res0..res7 (each has an _r and an _i vector), two pairs per
   * B vector.
   *
   * OP0..OP3 are token-pasted in front of '=' (OPn## =): an empty argument
   * yields plain assignment, '+' or '-' yields '+=' or '-='. OP4 is a unary
   * sign written in front of the a_r * b_i product. Per accumulator:
   *     resN_r OP0= a_r * b_r;      resN_r OP1= a_i * b_i;
   *     resN_i OP2= OP4 a_r * b_i;  resN_i OP3= a_i * b_r;
   *
   * Requires pa0, pb0, the src_ vectors and the resN_r, resN_i vectors to
   * be declared at the expansion site.
   * NOTE(review): LD_DP4_INC, PCKEVOD_D2_DP and SPLATI_D2_DP are not defined
   * in this file; by their names they presumably load-and-advance the
   * pointer, split even/odd lanes (real/imaginary), and broadcast each lane
   * of a B vector -- confirm against macros_msa.h.
   */
  29. #define ZGEMM_KERNEL_4X4_MSA(OP0, OP1, OP2, OP3, OP4) \
  30. { \
  31. LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); \
  32. LD_DP4_INC(pb0, 2, src_b0, src_b1, src_b2, src_b3); \
  33. \
  34. PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \
  35. PCKEVOD_D2_DP(src_a3, src_a2, src_a1r, src_a1i); \
  36. \
  37. /* 0th col: src_b0 against both A halves -> res0 (a0), res1 (a1) */ \
  38. SPLATI_D2_DP(src_b0, src_br, src_bi); \
  39. res0_r OP0## = src_a0r * src_br; \
  40. res0_r OP1## = src_a0i * src_bi; \
  41. res0_i OP2## = OP4 src_a0r * src_bi; \
  42. res0_i OP3## = src_a0i * src_br; \
  43. \
  44. res1_r OP0## = src_a1r * src_br; \
  45. res1_r OP1## = src_a1i * src_bi; \
  46. res1_i OP2## = OP4 src_a1r * src_bi; \
  47. res1_i OP3## = src_a1i * src_br; \
  48. \
  49. /* 1st col: src_b1 -> res2 (a0), res3 (a1) */ \
  50. SPLATI_D2_DP(src_b1, src_br, src_bi); \
  51. res2_r OP0## = src_a0r * src_br; \
  52. res2_r OP1## = src_a0i * src_bi; \
  53. res2_i OP2## = OP4 src_a0r * src_bi; \
  54. res2_i OP3## = src_a0i * src_br; \
  55. \
  56. res3_r OP0## = src_a1r * src_br; \
  57. res3_r OP1## = src_a1i * src_bi; \
  58. res3_i OP2## = OP4 src_a1r * src_bi; \
  59. res3_i OP3## = src_a1i * src_br; \
  60. \
  61. /* 2nd col: src_b2 -> res4 (a0), res5 (a1) */ \
  62. SPLATI_D2_DP(src_b2, src_br, src_bi); \
  63. res4_r OP0## = src_a0r * src_br; \
  64. res4_r OP1## = src_a0i * src_bi; \
  65. res4_i OP2## = OP4 src_a0r * src_bi; \
  66. res4_i OP3## = src_a0i * src_br; \
  67. \
  68. res5_r OP0## = src_a1r * src_br; \
  69. res5_r OP1## = src_a1i * src_bi; \
  70. res5_i OP2## = OP4 src_a1r * src_bi; \
  71. res5_i OP3## = src_a1i * src_br; \
  72. \
  73. /* 3rd col: src_b3 -> res6 (a0), res7 (a1) */ \
  74. SPLATI_D2_DP(src_b3, src_br, src_bi); \
  75. res6_r OP0## = src_a0r * src_br; \
  76. res6_r OP1## = src_a0i * src_bi; \
  77. res6_i OP2## = OP4 src_a0r * src_bi; \
  78. res6_i OP3## = src_a0i * src_br; \
  79. \
  80. res7_r OP0## = src_a1r * src_br; \
  81. res7_r OP1## = src_a1i * src_bi; \
  82. res7_i OP2## = OP4 src_a1r * src_bi; \
  83. res7_i OP3## = src_a1i * src_br; \
  84. }
  /*
   * ZGEMM_KERNEL_2X4_MSA(OP0, OP1, OP2, OP3, OP4)
   *
   * One k-step of the 2x4 tile. Fills src_a0, src_a1 via LD_DP2_INC(pa0, ...)
   * and src_b0..src_b3 via LD_DP4_INC(pb0, ...), then updates res0, res2,
   * res4 and res6 (one accumulator pair per B vector; the odd-numbered
   * accumulators are not touched).
   *
   * OP0..OP3 are token-pasted in front of '=' (empty -> '=', '+' -> '+=',
   * '-' -> '-='); OP4 is a unary sign in front of the a_r * b_i product.
   * NOTE(review): the load/pack/splat helpers are defined outside this file;
   * their exact lane semantics and pointer side effects should be confirmed
   * in macros_msa.h.
   */
  85. #define ZGEMM_KERNEL_2X4_MSA(OP0, OP1, OP2, OP3, OP4) \
  86. { \
  87. LD_DP2_INC(pa0, 2, src_a0, src_a1); \
  88. LD_DP4_INC(pb0, 2, src_b0, src_b1, src_b2, src_b3); \
  89. \
  90. PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \
  91. \
  92. /* 0th col: src_b0 -> res0 */ \
  93. SPLATI_D2_DP(src_b0, src_br, src_bi); \
  94. res0_r OP0## = src_a0r * src_br; \
  95. res0_r OP1## = src_a0i * src_bi; \
  96. res0_i OP2## = OP4 src_a0r * src_bi; \
  97. res0_i OP3## = src_a0i * src_br; \
  98. \
  99. /* 1st col: src_b1 -> res2 */ \
  100. SPLATI_D2_DP(src_b1, src_br, src_bi); \
  101. res2_r OP0## = src_a0r * src_br; \
  102. res2_r OP1## = src_a0i * src_bi; \
  103. res2_i OP2## = OP4 src_a0r * src_bi; \
  104. res2_i OP3## = src_a0i * src_br; \
  105. \
  106. /* 2nd col: src_b2 -> res4 */ \
  107. SPLATI_D2_DP(src_b2, src_br, src_bi); \
  108. res4_r OP0## = src_a0r * src_br; \
  109. res4_r OP1## = src_a0i * src_bi; \
  110. res4_i OP2## = OP4 src_a0r * src_bi; \
  111. res4_i OP3## = src_a0i * src_br; \
  112. \
  113. /* 3rd col: src_b3 -> res6 */ \
  114. SPLATI_D2_DP(src_b3, src_br, src_bi); \
  115. res6_r OP0## = src_a0r * src_br; \
  116. res6_r OP1## = src_a0i * src_bi; \
  117. res6_i OP2## = OP4 src_a0r * src_bi; \
  118. res6_i OP3## = src_a0i * src_br; \
  119. }
  /*
   * ZGEMM_KERNEL_1X4_MSA(OP0, OP1, OP2, OP3, OP4)
   *
   * One k-step of the 1x4 tile. Reads one vector from pa0 with LD_DP (pa0
   * is NOT modified by this macro; the caller is expected to step it) and
   * four B vectors via LD_DP4_INC(pb0, ...). The single A vector is passed
   * as both inputs of PCKEVOD_D2_DP; B vectors are packed pairwise, so res0
   * covers columns 0 and 1 and res1 covers columns 2 and 3.
   *
   * OP0..OP3 are token-pasted in front of '=' (empty -> '=', '+' -> '+=',
   * '-' -> '-='); OP4 is a unary sign in front of the a_r * b_i product.
   * NOTE(review): helper macros come from outside this file; confirm their
   * lane semantics in macros_msa.h.
   */
  120. #define ZGEMM_KERNEL_1X4_MSA(OP0, OP1, OP2, OP3, OP4) \
  121. { \
  122. src_a0 = LD_DP(pa0); \
  123. LD_DP4_INC(pb0, 2, src_b0, src_b1, src_b2, src_b3); \
  124. \
  125. PCKEVOD_D2_DP(src_a0, src_a0, src_a0r, src_a0i); \
  126. \
  127. /* 0th and 1st col: src_b0, src_b1 packed together -> res0 */ \
  128. PCKEVOD_D2_DP(src_b1, src_b0, src_br, src_bi); \
  129. res0_r OP0## = src_a0r * src_br; \
  130. res0_r OP1## = src_a0i * src_bi; \
  131. res0_i OP2## = OP4 src_a0r * src_bi; \
  132. res0_i OP3## = src_a0i * src_br; \
  133. \
  134. /* 2nd and 3rd col: src_b2, src_b3 packed together -> res1 */ \
  135. PCKEVOD_D2_DP(src_b3, src_b2, src_br, src_bi); \
  136. res1_r OP0## = src_a0r * src_br; \
  137. res1_r OP1## = src_a0i * src_bi; \
  138. res1_i OP2## = OP4 src_a0r * src_bi; \
  139. res1_i OP3## = src_a0i * src_br; \
  140. }
  /*
   * ZGEMM_KERNEL_4X2_MSA(OP0, OP1, OP2, OP3, OP4)
   *
   * One k-step of the 4x2 tile. Fills src_a0..src_a3 via
   * LD_DP4_INC(pa0, ...) and src_b0, src_b1 via LD_DP2_INC(pb0, ...), then
   * updates res0/res1 from src_b0 and res2/res3 from src_b1.
   *
   * OP0..OP3 are token-pasted in front of '=' (empty -> '=', '+' -> '+=',
   * '-' -> '-='); OP4 is a unary sign in front of the a_r * b_i product.
   * NOTE(review): helper macros come from outside this file; confirm their
   * lane semantics and pointer side effects in macros_msa.h.
   */
  141. #define ZGEMM_KERNEL_4X2_MSA(OP0, OP1, OP2, OP3, OP4) \
  142. { \
  143. LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); \
  144. LD_DP2_INC(pb0, 2, src_b0, src_b1); \
  145. \
  146. PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \
  147. PCKEVOD_D2_DP(src_a3, src_a2, src_a1r, src_a1i); \
  148. \
  149. /* 0th col: src_b0 -> res0 (a0), res1 (a1) */ \
  150. SPLATI_D2_DP(src_b0, src_br, src_bi); \
  151. res0_r OP0## = src_a0r * src_br; \
  152. res0_r OP1## = src_a0i * src_bi; \
  153. res0_i OP2## = OP4 src_a0r * src_bi; \
  154. res0_i OP3## = src_a0i * src_br; \
  155. \
  156. res1_r OP0## = src_a1r * src_br; \
  157. res1_r OP1## = src_a1i * src_bi; \
  158. res1_i OP2## = OP4 src_a1r * src_bi; \
  159. res1_i OP3## = src_a1i * src_br; \
  160. \
  161. /* 1st col: src_b1 -> res2 (a0), res3 (a1) */ \
  162. SPLATI_D2_DP(src_b1, src_br, src_bi); \
  163. res2_r OP0## = src_a0r * src_br; \
  164. res2_r OP1## = src_a0i * src_bi; \
  165. res2_i OP2## = OP4 src_a0r * src_bi; \
  166. res2_i OP3## = src_a0i * src_br; \
  167. \
  168. res3_r OP0## = src_a1r * src_br; \
  169. res3_r OP1## = src_a1i * src_bi; \
  170. res3_i OP2## = OP4 src_a1r * src_bi; \
  171. res3_i OP3## = src_a1i * src_br; \
  172. }
  /*
   * ZGEMM_KERNEL_2X2_MSA(OP0, OP1, OP2, OP3, OP4)
   *
   * One k-step of the 2x2 tile. Fills src_a0, src_a1 via
   * LD_DP2_INC(pa0, ...) and src_b0, src_b1 via LD_DP2_INC(pb0, ...), then
   * updates res0 from src_b0 and res2 from src_b1.
   *
   * OP0..OP3 are token-pasted in front of '=' (empty -> '=', '+' -> '+=',
   * '-' -> '-='); OP4 is a unary sign in front of the a_r * b_i product.
   * NOTE(review): helper macros come from outside this file; confirm their
   * lane semantics and pointer side effects in macros_msa.h.
   */
  173. #define ZGEMM_KERNEL_2X2_MSA(OP0, OP1, OP2, OP3, OP4) \
  174. { \
  175. LD_DP2_INC(pa0, 2, src_a0, src_a1); \
  176. LD_DP2_INC(pb0, 2, src_b0, src_b1); \
  177. \
  178. PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \
  179. \
  180. /* 0th col: src_b0 -> res0 */ \
  181. SPLATI_D2_DP(src_b0, src_br, src_bi); \
  182. res0_r OP0## = src_a0r * src_br; \
  183. res0_r OP1## = src_a0i * src_bi; \
  184. res0_i OP2## = OP4 src_a0r * src_bi; \
  185. res0_i OP3## = src_a0i * src_br; \
  186. \
  187. /* 1st col: src_b1 -> res2 */ \
  188. SPLATI_D2_DP(src_b1, src_br, src_bi); \
  189. res2_r OP0## = src_a0r * src_br; \
  190. res2_r OP1## = src_a0i * src_bi; \
  191. res2_i OP2## = OP4 src_a0r * src_bi; \
  192. res2_i OP3## = src_a0i * src_br; \
  193. }
  /*
   * ZGEMM_KERNEL_1X2_MSA(OP0, OP1, OP2, OP3, OP4)
   *
   * One k-step of the 1x2 tile. Reads one vector from pa0 with LD_DP (pa0
   * is not modified by this macro) and src_b0, src_b1 via
   * LD_DP2_INC(pb0, ...). Both B vectors are packed together, so the single
   * accumulator pair res0 covers columns 0 and 1.
   *
   * OP0..OP3 are token-pasted in front of '=' (empty -> '=', '+' -> '+=',
   * '-' -> '-='); OP4 is a unary sign in front of the a_r * b_i product.
   * NOTE(review): helper macros come from outside this file; confirm their
   * lane semantics in macros_msa.h.
   */
  194. #define ZGEMM_KERNEL_1X2_MSA(OP0, OP1, OP2, OP3, OP4) \
  195. { \
  196. src_a0 = LD_DP(pa0); \
  197. LD_DP2_INC(pb0, 2, src_b0, src_b1); \
  198. \
  199. PCKEVOD_D2_DP(src_a0, src_a0, src_a0r, src_a0i); \
  200. \
  201. /* 0th and 1st col: src_b0, src_b1 packed together -> res0 */ \
  202. PCKEVOD_D2_DP(src_b1, src_b0, src_br, src_bi); \
  203. res0_r OP0## = src_a0r * src_br; \
  204. res0_r OP1## = src_a0i * src_bi; \
  205. res0_i OP2## = OP4 src_a0r * src_bi; \
  206. res0_i OP3## = src_a0i * src_br; \
  207. }
  /*
   * ZGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4)
   *
   * One k-step of the 4x1 tile. Fills src_a0..src_a3 via
   * LD_DP4_INC(pa0, ...) and reads one B vector from pb0 with LD_DP (pb0 is
   * not modified by this macro), then updates res0 (from src_a0r/src_a0i)
   * and res1 (from src_a1r/src_a1i).
   *
   * OP0..OP3 are token-pasted in front of '=' (empty -> '=', '+' -> '+=',
   * '-' -> '-='); OP4 is a unary sign in front of the a_r * b_i product.
   * NOTE(review): helper macros come from outside this file; confirm their
   * lane semantics and pointer side effects in macros_msa.h.
   */
  208. #define ZGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4) \
  209. { \
  210. LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); \
  211. src_b0 = LD_DP(pb0); \
  212. \
  213. PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \
  214. PCKEVOD_D2_DP(src_a3, src_a2, src_a1r, src_a1i); \
  215. \
  216. /* 0th col: src_b0 -> res0 (a0), res1 (a1) */ \
  217. SPLATI_D2_DP(src_b0, src_br, src_bi); \
  218. res0_r OP0## = src_a0r * src_br; \
  219. res0_r OP1## = src_a0i * src_bi; \
  220. res0_i OP2## = OP4 src_a0r * src_bi; \
  221. res0_i OP3## = src_a0i * src_br; \
  222. \
  223. res1_r OP0## = src_a1r * src_br; \
  224. res1_r OP1## = src_a1i * src_bi; \
  225. res1_i OP2## = OP4 src_a1r * src_bi; \
  226. res1_i OP3## = src_a1i * src_br; \
  227. }
  /*
   * ZGEMM_KERNEL_2X1_MSA(OP0, OP1, OP2, OP3, OP4)
   *
   * One k-step of the 2x1 tile. Fills src_a0, src_a1 via
   * LD_DP2_INC(pa0, ...) and reads one B vector from pb0 with LD_DP (pb0 is
   * not modified by this macro), then updates the single accumulator pair
   * res0.
   *
   * OP0..OP3 are token-pasted in front of '=' (empty -> '=', '+' -> '+=',
   * '-' -> '-='); OP4 is a unary sign in front of the a_r * b_i product.
   * NOTE(review): helper macros come from outside this file; confirm their
   * lane semantics and pointer side effects in macros_msa.h.
   */
  228. #define ZGEMM_KERNEL_2X1_MSA(OP0, OP1, OP2, OP3, OP4) \
  229. { \
  230. LD_DP2_INC(pa0, 2, src_a0, src_a1); \
  231. src_b0 = LD_DP(pb0); \
  232. \
  233. PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \
  234. \
  235. /* 0th col: src_b0 -> res0 */ \
  236. SPLATI_D2_DP(src_b0, src_br, src_bi); \
  237. res0_r OP0## = src_a0r * src_br; \
  238. res0_r OP1## = src_a0i * src_bi; \
  239. res0_i OP2## = OP4 src_a0r * src_bi; \
  240. res0_i OP3## = src_a0i * src_br; \
  241. }
  /*
   * ZGEMM_KERNEL_1X1(OP0, OP1, OP2, OP3, OP4)
   *
   * Scalar k-step for a single element: reads pa0[0], pa0[1] into
   * a0_r, a0_i and pb0[0], pb0[1] into b0_r, b0_i, then updates the scalar
   * accumulators res0 (from a0_r*b0_r and a0_i*b0_i) and res1 (from
   * a0_r*b0_i and a0_i*b0_r). Neither pa0 nor pb0 is advanced here.
   *
   * OP0..OP3 are token-pasted in front of '=' (empty -> '=', '+' -> '+=',
   * '-' -> '-='); OP4 is a unary sign in front of the a0_r * b0_i product.
   */
  242. #define ZGEMM_KERNEL_1X1(OP0, OP1, OP2, OP3, OP4) \
  243. { \
  244. /* 0th col: load one A element and one B element (two values each) */ \
  245. a0_r = pa0[0]; \
  246. a0_i = pa0[1]; \
  247. b0_r = pb0[0]; \
  248. b0_i = pb0[1]; \
  249. \
  250. res0 OP0## = a0_r * b0_r; \
  251. res0 OP1## = a0_i * b0_i; \
  252. res1 OP2## = OP4 a0_r * b0_i; \
  253. res1 OP3## = a0_i * b0_r; \
  254. }
  /*
   * ZGEMM_SCALE_4X4_MSA
   *
   * Write-back for the 4x4 tile (accumulating form). For each of the four
   * output pointers pc0..pc3 it loads four vectors, splits them into
   * dst0_r/dst0_i and dst1_r/dst1_i, adds alpha times the accumulators:
   *     dst_r += alpha_r * res_r - alpha_i * res_i
   *     dst_i += alpha_r * res_i + alpha_i * res_r
   * (as four separate statements, in that order), re-interleaves with
   * ILVRL_D2_DP and stores through ST_DP4_INC.
   * Accumulator mapping: pc0 <- res0, res1; pc1 <- res2, res3;
   * pc2 <- res4, res5; pc3 <- res6, res7.
   *
   * NOTE(review): LD_DP4, PCKEVOD_D2_DP, ILVRL_D2_DP and ST_DP4_INC are
   * defined outside this file; ST_DP4_INC presumably advances the pointer
   * it is given -- confirm in macros_msa.h.
   */
  255. #define ZGEMM_SCALE_4X4_MSA \
  256. { \
  257. LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); /* pc0 <- res0, res1 */ \
  258. \
  259. PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
  260. PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i); \
  261. \
  262. dst0_r += alpha_r * res0_r; \
  263. dst0_r -= alpha_i * res0_i; \
  264. dst0_i += alpha_r * res0_i; \
  265. dst0_i += alpha_i * res0_r; \
  266. \
  267. dst1_r += alpha_r * res1_r; \
  268. dst1_r -= alpha_i * res1_i; \
  269. dst1_i += alpha_r * res1_i; \
  270. dst1_i += alpha_i * res1_r; \
  271. \
  272. ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
  273. ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \
  274. \
  275. LD_DP4(pc1, 2, dst4, dst5, dst6, dst7); /* pc1 <- res2, res3 */ \
  276. \
  277. PCKEVOD_D2_DP(dst5, dst4, dst0_r, dst0_i); \
  278. PCKEVOD_D2_DP(dst7, dst6, dst1_r, dst1_i); \
  279. \
  280. dst0_r += alpha_r * res2_r; \
  281. dst0_r -= alpha_i * res2_i; \
  282. dst0_i += alpha_r * res2_i; \
  283. dst0_i += alpha_i * res2_r; \
  284. \
  285. dst1_r += alpha_r * res3_r; \
  286. dst1_r -= alpha_i * res3_i; \
  287. dst1_i += alpha_r * res3_i; \
  288. dst1_i += alpha_i * res3_r; \
  289. \
  290. ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \
  291. ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \
  292. \
  293. ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \
  294. ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); \
  295. \
  296. LD_DP4(pc2, 2, dst0, dst1, dst2, dst3); /* pc2 <- res4, res5 */ \
  297. \
  298. PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
  299. PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i); \
  300. \
  301. dst0_r += alpha_r * res4_r; \
  302. dst0_r -= alpha_i * res4_i; \
  303. dst0_i += alpha_r * res4_i; \
  304. dst0_i += alpha_i * res4_r; \
  305. \
  306. dst1_r += alpha_r * res5_r; \
  307. dst1_r -= alpha_i * res5_i; \
  308. dst1_i += alpha_r * res5_i; \
  309. dst1_i += alpha_i * res5_r; \
  310. \
  311. ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
  312. ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \
  313. \
  314. LD_DP4(pc3, 2, dst4, dst5, dst6, dst7); /* pc3 <- res6, res7 */ \
  315. \
  316. PCKEVOD_D2_DP(dst5, dst4, dst0_r, dst0_i); \
  317. PCKEVOD_D2_DP(dst7, dst6, dst1_r, dst1_i); \
  318. \
  319. dst0_r += alpha_r * res6_r; \
  320. dst0_r -= alpha_i * res6_i; \
  321. dst0_i += alpha_r * res6_i; \
  322. dst0_i += alpha_i * res6_r; \
  323. \
  324. dst1_r += alpha_r * res7_r; \
  325. dst1_r -= alpha_i * res7_i; \
  326. dst1_i += alpha_r * res7_i; \
  327. dst1_i += alpha_i * res7_r; \
  328. \
  329. ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \
  330. ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \
  331. \
  332. ST_DP4_INC(dst0, dst1, dst2, dst3, pc2, 2); \
  333. ST_DP4_INC(dst4, dst5, dst6, dst7, pc3, 2); \
  334. }
  /*
   * ZGEMM_SCALE_2X4_MSA
   *
   * Write-back for the 2x4 tile (accumulating form). For each of pc0..pc3
   * it loads two vectors, splits them into dst0_r/dst0_i, applies
   *     dst_r += alpha_r * res_r - alpha_i * res_i
   *     dst_i += alpha_r * res_i + alpha_i * res_r
   * re-interleaves with ILVRL_D2_DP and stores through ST_DP2_INC.
   * Accumulator mapping: pc0 <- res0, pc1 <- res2, pc2 <- res4, pc3 <- res6.
   *
   * NOTE(review): the load/pack/interleave/store helpers are defined outside
   * this file; ST_DP2_INC presumably advances its pointer -- confirm in
   * macros_msa.h.
   */
  335. #define ZGEMM_SCALE_2X4_MSA \
  336. { \
  337. LD_DP2(pc0, 2, dst0, dst1); /* pc0 <- res0 */ \
  338. \
  339. PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
  340. \
  341. dst0_r += alpha_r * res0_r; \
  342. dst0_r -= alpha_i * res0_i; \
  343. dst0_i += alpha_r * res0_i; \
  344. dst0_i += alpha_i * res0_r; \
  345. \
  346. ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
  347. \
  348. LD_DP2(pc1, 2, dst2, dst3); /* pc1 <- res2 */ \
  349. \
  350. PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i); \
  351. \
  352. dst0_r += alpha_r * res2_r; \
  353. dst0_r -= alpha_i * res2_i; \
  354. dst0_i += alpha_r * res2_i; \
  355. dst0_i += alpha_i * res2_r; \
  356. \
  357. ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \
  358. \
  359. ST_DP2_INC(dst0, dst1, pc0, 2); \
  360. ST_DP2_INC(dst2, dst3, pc1, 2); \
  361. \
  362. LD_DP2(pc2, 2, dst0, dst1); /* pc2 <- res4 */ \
  363. \
  364. PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
  365. \
  366. dst0_r += alpha_r * res4_r; \
  367. dst0_r -= alpha_i * res4_i; \
  368. dst0_i += alpha_r * res4_i; \
  369. dst0_i += alpha_i * res4_r; \
  370. \
  371. ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
  372. \
  373. LD_DP2(pc3, 2, dst2, dst3); /* pc3 <- res6 */ \
  374. \
  375. PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i); \
  376. \
  377. dst0_r += alpha_r * res6_r; \
  378. dst0_r -= alpha_i * res6_i; \
  379. dst0_i += alpha_r * res6_i; \
  380. dst0_i += alpha_i * res6_r; \
  381. \
  382. ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \
  383. \
  384. ST_DP2_INC(dst0, dst1, pc2, 2); \
  385. ST_DP2_INC(dst2, dst3, pc3, 2); \
  386. }
  /*
   * ZGEMM_SCALE_1X4_MSA
   *
   * Write-back for the 1x4 tile (accumulating form). Loads one vector from
   * each of pc0 and pc1, packs the pair into dst0_r/dst0_i, adds alpha
   * times res0, and un-packs back into dst0/dst1; then does the same for
   * pc2/pc3 with res1. All four vectors are stored at the end with ST_DP.
   *     dst_r += alpha_r * res_r - alpha_i * res_i
   *     dst_i += alpha_r * res_i + alpha_i * res_r
   * The pc pointers are not modified by this macro.
   *
   * NOTE(review): LD_DP, ST_DP, PCKEVOD_D2_DP and ILVRL_D2_DP are defined
   * outside this file; confirm their lane semantics in macros_msa.h.
   */
  387. #define ZGEMM_SCALE_1X4_MSA \
  388. { \
  389. dst0 = LD_DP(pc0); \
  390. dst1 = LD_DP(pc1); \
  391. \
  392. PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
  393. \
  394. dst0_r += alpha_r * res0_r; \
  395. dst0_r -= alpha_i * res0_i; \
  396. dst0_i += alpha_r * res0_i; \
  397. dst0_i += alpha_i * res0_r; \
  398. \
  399. ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
  400. \
  401. dst2 = LD_DP(pc2); \
  402. dst3 = LD_DP(pc3); \
  403. \
  404. PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i); \
  405. \
  406. dst0_r += alpha_r * res1_r; \
  407. dst0_r -= alpha_i * res1_i; \
  408. dst0_i += alpha_r * res1_i; \
  409. dst0_i += alpha_i * res1_r; \
  410. \
  411. ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \
  412. \
  413. ST_DP(dst0, pc0); \
  414. ST_DP(dst1, pc1); \
  415. ST_DP(dst2, pc2); \
  416. ST_DP(dst3, pc3); \
  417. }
  /*
   * ZGEMM_SCALE_4X2_MSA
   *
   * Write-back for the 4x2 tile (accumulating form). For pc0 and pc1 it
   * loads four vectors, splits them into dst0_r/dst0_i and dst1_r/dst1_i,
   * applies
   *     dst_r += alpha_r * res_r - alpha_i * res_i
   *     dst_i += alpha_r * res_i + alpha_i * res_r
   * re-interleaves with ILVRL_D2_DP and stores through ST_DP4_INC.
   * Accumulator mapping: pc0 <- res0, res1; pc1 <- res2, res3.
   *
   * NOTE(review): helper macros are defined outside this file; ST_DP4_INC
   * presumably advances its pointer -- confirm in macros_msa.h.
   */
  418. #define ZGEMM_SCALE_4X2_MSA \
  419. { \
  420. LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); /* pc0 <- res0, res1 */ \
  421. \
  422. PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
  423. PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i); \
  424. \
  425. dst0_r += alpha_r * res0_r; \
  426. dst0_r -= alpha_i * res0_i; \
  427. dst0_i += alpha_r * res0_i; \
  428. dst0_i += alpha_i * res0_r; \
  429. \
  430. dst1_r += alpha_r * res1_r; \
  431. dst1_r -= alpha_i * res1_i; \
  432. dst1_i += alpha_r * res1_i; \
  433. dst1_i += alpha_i * res1_r; \
  434. \
  435. ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
  436. ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \
  437. \
  438. LD_DP4(pc1, 2, dst4, dst5, dst6, dst7); /* pc1 <- res2, res3 */ \
  439. \
  440. PCKEVOD_D2_DP(dst5, dst4, dst0_r, dst0_i); \
  441. PCKEVOD_D2_DP(dst7, dst6, dst1_r, dst1_i); \
  442. \
  443. dst0_r += alpha_r * res2_r; \
  444. dst0_r -= alpha_i * res2_i; \
  445. dst0_i += alpha_r * res2_i; \
  446. dst0_i += alpha_i * res2_r; \
  447. \
  448. dst1_r += alpha_r * res3_r; \
  449. dst1_r -= alpha_i * res3_i; \
  450. dst1_i += alpha_r * res3_i; \
  451. dst1_i += alpha_i * res3_r; \
  452. \
  453. ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \
  454. ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \
  455. \
  456. ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \
  457. ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); \
  458. }
  /*
   * ZGEMM_SCALE_2X2_MSA
   *
   * Write-back for the 2x2 tile (accumulating form). Handles pc0 (with
   * res0) completely -- load, split, update, re-interleave, store -- and
   * then repeats the sequence for pc1 (with res2).
   *     dst_r += alpha_r * res_r - alpha_i * res_i
   *     dst_i += alpha_r * res_i + alpha_i * res_r
   *
   * NOTE(review): helper macros are defined outside this file; ST_DP2_INC
   * presumably advances its pointer -- confirm in macros_msa.h.
   */
  459. #define ZGEMM_SCALE_2X2_MSA \
  460. { \
  461. LD_DP2(pc0, 2, dst0, dst1); /* pc0 <- res0 */ \
  462. \
  463. PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
  464. \
  465. dst0_r += alpha_r * res0_r; \
  466. dst0_r -= alpha_i * res0_i; \
  467. dst0_i += alpha_r * res0_i; \
  468. dst0_i += alpha_i * res0_r; \
  469. \
  470. ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
  471. \
  472. ST_DP2_INC(dst0, dst1, pc0, 2); \
  473. \
  474. LD_DP2(pc1, 2, dst2, dst3); /* pc1 <- res2 */ \
  475. \
  476. PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i); \
  477. \
  478. dst0_r += alpha_r * res2_r; \
  479. dst0_r -= alpha_i * res2_i; \
  480. dst0_i += alpha_r * res2_i; \
  481. dst0_i += alpha_i * res2_r; \
  482. \
  483. ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \
  484. \
  485. ST_DP2_INC(dst2, dst3, pc1, 2); \
  486. }
  /*
   * ZGEMM_SCALE_1X2_MSA
   *
   * Write-back for the 1x2 tile (accumulating form). Loads one vector from
   * each of pc0 and pc1, packs the pair into dst0_r/dst0_i, adds alpha
   * times res0:
   *     dst_r += alpha_r * res_r - alpha_i * res_i
   *     dst_i += alpha_r * res_i + alpha_i * res_r
   * then un-packs into dst0/dst1 and stores them back with ST_DP.
   * The pc pointers are not modified by this macro.
   *
   * NOTE(review): helper macros are defined outside this file; confirm
   * their lane semantics in macros_msa.h.
   */
  487. #define ZGEMM_SCALE_1X2_MSA \
  488. { \
  489. dst0 = LD_DP(pc0); \
  490. dst1 = LD_DP(pc1); \
  491. \
  492. PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
  493. \
  494. dst0_r += alpha_r * res0_r; \
  495. dst0_r -= alpha_i * res0_i; \
  496. dst0_i += alpha_r * res0_i; \
  497. dst0_i += alpha_i * res0_r; \
  498. \
  499. ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
  500. \
  501. ST_DP(dst0, pc0); \
  502. ST_DP(dst1, pc1); \
  503. }
  /*
   * ZGEMM_SCALE_4X1_MSA
   *
   * Write-back for the 4x1 tile (accumulating form). Loads four vectors
   * from pc0, splits them into dst0_r/dst0_i and dst1_r/dst1_i, adds alpha
   * times res0 and res1 respectively:
   *     dst_r += alpha_r * res_r - alpha_i * res_i
   *     dst_i += alpha_r * res_i + alpha_i * res_r
   * re-interleaves with ILVRL_D2_DP and stores through ST_DP4_INC.
   *
   * NOTE(review): helper macros are defined outside this file; ST_DP4_INC
   * presumably advances pc0 -- confirm in macros_msa.h.
   */
  504. #define ZGEMM_SCALE_4X1_MSA \
  505. { \
  506. LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); \
  507. \
  508. PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
  509. PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i); \
  510. \
  511. dst0_r += alpha_r * res0_r; \
  512. dst0_r -= alpha_i * res0_i; \
  513. dst0_i += alpha_r * res0_i; \
  514. dst0_i += alpha_i * res0_r; \
  515. \
  516. dst1_r += alpha_r * res1_r; \
  517. dst1_r -= alpha_i * res1_i; \
  518. dst1_i += alpha_r * res1_i; \
  519. dst1_i += alpha_i * res1_r; \
  520. \
  521. ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
  522. ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \
  523. \
  524. ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \
  525. }
  /*
   * ZGEMM_SCALE_2X1_MSA
   *
   * Write-back for the 2x1 tile (accumulating form). Loads two vectors from
   * pc0, splits them into dst0_r/dst0_i, adds alpha times res0:
   *     dst_r += alpha_r * res_r - alpha_i * res_i
   *     dst_i += alpha_r * res_i + alpha_i * res_r
   * re-interleaves with ILVRL_D2_DP and stores through ST_DP2_INC.
   *
   * NOTE(review): helper macros are defined outside this file; ST_DP2_INC
   * presumably advances pc0 -- confirm in macros_msa.h.
   */
  526. #define ZGEMM_SCALE_2X1_MSA \
  527. { \
  528. LD_DP2(pc0, 2, dst0, dst1); \
  529. \
  530. PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
  531. \
  532. dst0_r += alpha_r * res0_r; \
  533. dst0_r -= alpha_i * res0_i; \
  534. dst0_i += alpha_r * res0_i; \
  535. dst0_i += alpha_i * res0_r; \
  536. \
  537. ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
  538. \
  539. ST_DP2_INC(dst0, dst1, pc0, 2); \
  540. }
  /*
   * ZGEMM_SCALE_1X1
   *
   * Scalar write-back for a single element (accumulating form). Uses the
   * scalar alphar/alphai (not the alpha_r/alpha_i vectors) and the scalar
   * accumulators res0/res1:
   *     pc0[0] += alphar * res0 - alphai * res1
   *     pc0[1] += alphar * res1 + alphai * res0
   * performed as four separate read-modify-write statements. pc0 itself is
   * not advanced.
   */
  541. #define ZGEMM_SCALE_1X1 \
  542. { \
  543. pc0[0] += alphar * res0; \
  544. pc0[0] -= alphai * res1; \
  545. pc0[1] += alphar * res1; \
  546. pc0[1] += alphai * res0; \
  547. }
  /*
   * ZGEMM_TRMM_SCALE_4X4_MSA
   *
   * Write-back for the 4x4 tile (overwriting form). Unlike
   * ZGEMM_SCALE_4X4_MSA it never loads from pc0..pc3: each result is
   * computed from scratch as
   *     dst_r = alpha_r * res_r - alpha_i * res_i
   *     dst_i = alpha_r * res_i + alpha_i * res_r
   * interleaved with ILVRL_D2_DP and stored through ST_DP4_INC.
   * Accumulator mapping: pc0 <- res0, res1; pc1 <- res2, res3;
   * pc2 <- res4, res5; pc3 <- res6, res7.
   *
   * NOTE(review): ILVRL_D2_DP and ST_DP4_INC are defined outside this file;
   * ST_DP4_INC presumably advances its pointer -- confirm in macros_msa.h.
   */
  548. #define ZGEMM_TRMM_SCALE_4X4_MSA \
  549. { \
  550. dst0_r = alpha_r * res0_r; /* res0, res1 -> dst0..dst3 (pc0) */ \
  551. dst0_r -= alpha_i * res0_i; \
  552. dst0_i = alpha_r * res0_i; \
  553. dst0_i += alpha_i * res0_r; \
  554. \
  555. dst1_r = alpha_r * res1_r; \
  556. dst1_r -= alpha_i * res1_i; \
  557. dst1_i = alpha_r * res1_i; \
  558. dst1_i += alpha_i * res1_r; \
  559. \
  560. ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
  561. ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \
  562. \
  563. dst0_r = alpha_r * res2_r; /* res2, res3 -> dst4..dst7 (pc1) */ \
  564. dst0_r -= alpha_i * res2_i; \
  565. dst0_i = alpha_r * res2_i; \
  566. dst0_i += alpha_i * res2_r; \
  567. \
  568. dst1_r = alpha_r * res3_r; \
  569. dst1_r -= alpha_i * res3_i; \
  570. dst1_i = alpha_r * res3_i; \
  571. dst1_i += alpha_i * res3_r; \
  572. \
  573. ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \
  574. ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \
  575. \
  576. ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \
  577. ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); \
  578. \
  579. dst0_r = alpha_r * res4_r; /* res4, res5 -> dst0..dst3 (pc2) */ \
  580. dst0_r -= alpha_i * res4_i; \
  581. dst0_i = alpha_r * res4_i; \
  582. dst0_i += alpha_i * res4_r; \
  583. \
  584. dst1_r = alpha_r * res5_r; \
  585. dst1_r -= alpha_i * res5_i; \
  586. dst1_i = alpha_r * res5_i; \
  587. dst1_i += alpha_i * res5_r; \
  588. \
  589. ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
  590. ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \
  591. \
  592. dst0_r = alpha_r * res6_r; /* res6, res7 -> dst4..dst7 (pc3) */ \
  593. dst0_r -= alpha_i * res6_i; \
  594. dst0_i = alpha_r * res6_i; \
  595. dst0_i += alpha_i * res6_r; \
  596. \
  597. dst1_r = alpha_r * res7_r; \
  598. dst1_r -= alpha_i * res7_i; \
  599. dst1_i = alpha_r * res7_i; \
  600. dst1_i += alpha_i * res7_r; \
  601. \
  602. ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \
  603. ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \
  604. \
  605. ST_DP4_INC(dst0, dst1, dst2, dst3, pc2, 2); \
  606. ST_DP4_INC(dst4, dst5, dst6, dst7, pc3, 2); \
  607. }
  /*
   * ZGEMM_TRMM_SCALE_2X4_MSA
   *
   * Write-back for the 2x4 tile (overwriting form). Nothing is loaded from
   * pc0..pc3; each result is computed as
   *     dst_r = alpha_r * res_r - alpha_i * res_i
   *     dst_i = alpha_r * res_i + alpha_i * res_r
   * interleaved with ILVRL_D2_DP and stored through ST_DP2_INC.
   * Accumulator mapping: pc0 <- res0, pc1 <- res2, pc2 <- res4, pc3 <- res6.
   *
   * NOTE(review): ILVRL_D2_DP and ST_DP2_INC are defined outside this file;
   * ST_DP2_INC presumably advances its pointer -- confirm in macros_msa.h.
   */
  608. #define ZGEMM_TRMM_SCALE_2X4_MSA \
  609. { \
  610. dst0_r = alpha_r * res0_r; /* res0 -> dst0, dst1 (pc0) */ \
  611. dst0_r -= alpha_i * res0_i; \
  612. dst0_i = alpha_r * res0_i; \
  613. dst0_i += alpha_i * res0_r; \
  614. \
  615. ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
  616. \
  617. dst0_r = alpha_r * res2_r; /* res2 -> dst2, dst3 (pc1) */ \
  618. dst0_r -= alpha_i * res2_i; \
  619. dst0_i = alpha_r * res2_i; \
  620. dst0_i += alpha_i * res2_r; \
  621. \
  622. ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \
  623. \
  624. ST_DP2_INC(dst0, dst1, pc0, 2); \
  625. ST_DP2_INC(dst2, dst3, pc1, 2); \
  626. \
  627. dst0_r = alpha_r * res4_r; /* res4 -> dst0, dst1 (pc2) */ \
  628. dst0_r -= alpha_i * res4_i; \
  629. dst0_i = alpha_r * res4_i; \
  630. dst0_i += alpha_i * res4_r; \
  631. \
  632. ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
  633. \
  634. dst0_r = alpha_r * res6_r; /* res6 -> dst2, dst3 (pc3) */ \
  635. dst0_r -= alpha_i * res6_i; \
  636. dst0_i = alpha_r * res6_i; \
  637. dst0_i += alpha_i * res6_r; \
  638. \
  639. ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \
  640. \
  641. ST_DP2_INC(dst0, dst1, pc2, 2); \
  642. ST_DP2_INC(dst2, dst3, pc3, 2); \
  643. }
  /*
   * ZGEMM_TRMM_SCALE_1X4_MSA
   *
   * Write-back for the 1x4 tile (overwriting form). Computes
   *     dst_r = alpha_r * res_r - alpha_i * res_i
   *     dst_i = alpha_r * res_i + alpha_i * res_r
   * from res0 (un-packed into dst0, dst1) and from res1 (un-packed into
   * dst2, dst3), then stores dst0..dst3 to pc0..pc3 with ST_DP. Nothing is
   * loaded from the pc pointers and they are not modified here.
   *
   * NOTE(review): ILVRL_D2_DP and ST_DP are defined outside this file;
   * confirm their lane semantics in macros_msa.h.
   */
  644. #define ZGEMM_TRMM_SCALE_1X4_MSA \
  645. { \
  646. dst0_r = alpha_r * res0_r; /* res0 -> dst0 (pc0), dst1 (pc1) */ \
  647. dst0_r -= alpha_i * res0_i; \
  648. dst0_i = alpha_r * res0_i; \
  649. dst0_i += alpha_i * res0_r; \
  650. \
  651. ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
  652. \
  653. dst0_r = alpha_r * res1_r; /* res1 -> dst2 (pc2), dst3 (pc3) */ \
  654. dst0_r -= alpha_i * res1_i; \
  655. dst0_i = alpha_r * res1_i; \
  656. dst0_i += alpha_i * res1_r; \
  657. \
  658. ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \
  659. \
  660. ST_DP(dst0, pc0); \
  661. ST_DP(dst1, pc1); \
  662. ST_DP(dst2, pc2); \
  663. ST_DP(dst3, pc3); \
  664. }
  /*
   * ZGEMM_TRMM_SCALE_4X2_MSA
   *
   * Write-back for the 4x2 tile (overwriting form). Nothing is loaded from
   * pc0/pc1; each result is computed as
   *     dst_r = alpha_r * res_r - alpha_i * res_i
   *     dst_i = alpha_r * res_i + alpha_i * res_r
   * interleaved with ILVRL_D2_DP and stored through ST_DP4_INC.
   * Accumulator mapping: pc0 <- res0, res1; pc1 <- res2, res3.
   *
   * NOTE(review): ILVRL_D2_DP and ST_DP4_INC are defined outside this file;
   * ST_DP4_INC presumably advances its pointer -- confirm in macros_msa.h.
   */
  665. #define ZGEMM_TRMM_SCALE_4X2_MSA \
  666. { \
  667. dst0_r = alpha_r * res0_r; /* res0, res1 -> dst0..dst3 (pc0) */ \
  668. dst0_r -= alpha_i * res0_i; \
  669. dst0_i = alpha_r * res0_i; \
  670. dst0_i += alpha_i * res0_r; \
  671. \
  672. dst1_r = alpha_r * res1_r; \
  673. dst1_r -= alpha_i * res1_i; \
  674. dst1_i = alpha_r * res1_i; \
  675. dst1_i += alpha_i * res1_r; \
  676. \
  677. ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
  678. ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \
  679. \
  680. dst0_r = alpha_r * res2_r; /* res2, res3 -> dst4..dst7 (pc1) */ \
  681. dst0_r -= alpha_i * res2_i; \
  682. dst0_i = alpha_r * res2_i; \
  683. dst0_i += alpha_i * res2_r; \
  684. \
  685. dst1_r = alpha_r * res3_r; \
  686. dst1_r -= alpha_i * res3_i; \
  687. dst1_i = alpha_r * res3_i; \
  688. dst1_i += alpha_i * res3_r; \
  689. \
  690. ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \
  691. ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \
  692. \
  693. ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \
  694. ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); \
  695. }
  /*
   * ZGEMM_TRMM_SCALE_2X2_MSA
   *
   * Write-back for the 2x2 tile (overwriting form). Computes
   *     dst_r = alpha_r * res_r - alpha_i * res_i
   *     dst_i = alpha_r * res_i + alpha_i * res_r
   * from res0, interleaves and stores it through ST_DP2_INC to pc0, then
   * repeats with res2 for pc1. Nothing is loaded from pc0/pc1.
   *
   * NOTE(review): ILVRL_D2_DP and ST_DP2_INC are defined outside this file;
   * ST_DP2_INC presumably advances its pointer -- confirm in macros_msa.h.
   */
  696. #define ZGEMM_TRMM_SCALE_2X2_MSA \
  697. { \
  698. dst0_r = alpha_r * res0_r; /* res0 -> dst0, dst1 (pc0) */ \
  699. dst0_r -= alpha_i * res0_i; \
  700. dst0_i = alpha_r * res0_i; \
  701. dst0_i += alpha_i * res0_r; \
  702. \
  703. ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
  704. \
  705. ST_DP2_INC(dst0, dst1, pc0, 2); \
  706. \
  707. dst0_r = alpha_r * res2_r; /* res2 -> dst2, dst3 (pc1) */ \
  708. dst0_r -= alpha_i * res2_i; \
  709. dst0_i = alpha_r * res2_i; \
  710. dst0_i += alpha_i * res2_r; \
  711. \
  712. ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \
  713. \
  714. ST_DP2_INC(dst2, dst3, pc1, 2); \
  715. }
  /*
   * ZGEMM_TRMM_SCALE_1X2_MSA
   *
   * Write-back for the 1x2 tile (overwriting form). Computes
   *     dst_r = alpha_r * res_r - alpha_i * res_i
   *     dst_i = alpha_r * res_i + alpha_i * res_r
   * from res0, un-packs the result into dst0 and dst1 with ILVRL_D2_DP, and
   * stores them to pc0 and pc1 with ST_DP. Nothing is loaded from the pc
   * pointers and they are not modified here.
   *
   * NOTE(review): ILVRL_D2_DP and ST_DP are defined outside this file;
   * confirm their lane semantics in macros_msa.h.
   */
  716. #define ZGEMM_TRMM_SCALE_1X2_MSA \
  717. { \
  718. dst0_r = alpha_r * res0_r; \
  719. dst0_r -= alpha_i * res0_i; \
  720. dst0_i = alpha_r * res0_i; \
  721. dst0_i += alpha_i * res0_r; \
  722. \
  723. ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
  724. \
  725. ST_DP(dst0, pc0); \
  726. ST_DP(dst1, pc1); \
  727. }
  /*
   * ZGEMM_TRMM_SCALE_4X1_MSA
   *
   * Write-back for the 4x1 tile (overwriting form). Computes
   *     dst_r = alpha_r * res_r - alpha_i * res_i
   *     dst_i = alpha_r * res_i + alpha_i * res_r
   * from res0 and res1, interleaves the results into dst0..dst3 and stores
   * them through ST_DP4_INC to pc0. Nothing is loaded from pc0.
   *
   * NOTE(review): ILVRL_D2_DP and ST_DP4_INC are defined outside this file;
   * ST_DP4_INC presumably advances pc0 -- confirm in macros_msa.h.
   */
  728. #define ZGEMM_TRMM_SCALE_4X1_MSA \
  729. { \
  730. dst0_r = alpha_r * res0_r; \
  731. dst0_r -= alpha_i * res0_i; \
  732. dst0_i = alpha_r * res0_i; \
  733. dst0_i += alpha_i * res0_r; \
  734. \
  735. dst1_r = alpha_r * res1_r; \
  736. dst1_r -= alpha_i * res1_i; \
  737. dst1_i = alpha_r * res1_i; \
  738. dst1_i += alpha_i * res1_r; \
  739. \
  740. ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
  741. ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \
  742. \
  743. ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \
  744. }
  /*
   * ZGEMM_TRMM_SCALE_2X1_MSA
   *
   * Write-back for the 2x1 tile (overwriting form). Computes
   *     dst_r = alpha_r * res_r - alpha_i * res_i
   *     dst_i = alpha_r * res_i + alpha_i * res_r
   * from res0, interleaves the result into dst0, dst1 and stores them
   * through ST_DP2_INC to pc0. Nothing is loaded from pc0.
   *
   * NOTE(review): ILVRL_D2_DP and ST_DP2_INC are defined outside this file;
   * ST_DP2_INC presumably advances pc0 -- confirm in macros_msa.h.
   */
  745. #define ZGEMM_TRMM_SCALE_2X1_MSA \
  746. { \
  747. dst0_r = alpha_r * res0_r; \
  748. dst0_r -= alpha_i * res0_i; \
  749. dst0_i = alpha_r * res0_i; \
  750. dst0_i += alpha_i * res0_r; \
  751. \
  752. ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
  753. \
  754. ST_DP2_INC(dst0, dst1, pc0, 2); \
  755. }
  /*
   * ZGEMM_TRMM_SCALE_1X1
   *
   * Scalar write-back for a single element (overwriting form). Uses the
   * scalar alphar/alphai and the scalar accumulators res0/res1:
   *     pc0[0] = alphar * res0 - alphai * res1
   *     pc0[1] = alphar * res1 + alphai * res0
   * Each component is first assigned and then adjusted in a second
   * statement; the previous contents of pc0[0] and pc0[1] are discarded.
   * pc0 itself is not advanced.
   */
  756. #define ZGEMM_TRMM_SCALE_1X1 \
  757. { \
  758. pc0[0] = alphar * res0; \
  759. pc0[0] -= alphai * res1; \
  760. pc0[1] = alphar * res1; \
  761. pc0[1] += alphai * res0; \
  762. }
  763. int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai,
  764. FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc
  765. #ifdef TRMMKERNEL
  766. , BLASLONG offset
  767. #endif
  768. )
  769. {
  770. BLASLONG i, j, l, temp;
  771. #if defined(TRMMKERNEL)
  772. BLASLONG off;
  773. #endif
  774. FLOAT *pc0, *pc1, *pc2, *pc3, *pa0, *pb0;
  775. FLOAT res0, res1, a0_r, a0_i, b0_r, b0_i;
  776. v2f64 src_a0, src_a1, src_a2, src_a3, src_b0, src_b1, src_b2, src_b3;
  777. v2f64 src_a0r, src_a0i, src_a1r, src_a1i, src_br, src_bi;
  778. v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  779. v2f64 dst0_r, dst0_i, dst1_r, dst1_i, alpha_r, alpha_i;
  780. v2f64 res0_r, res0_i, res1_r, res1_i, res2_r, res2_i, res3_r, res3_i;
  781. v2f64 res4_r, res4_i, res5_r, res5_i, res6_r, res6_i, res7_r, res7_i;
  782. alpha_r = COPY_DOUBLE_TO_VECTOR(alphar);
  783. alpha_i = COPY_DOUBLE_TO_VECTOR(alphai);
  784. #if defined(TRMMKERNEL) && !defined(LEFT)
  785. off = -offset;
  786. #endif
  787. for (j = (n >> 2); j--;)
  788. {
  789. pc0 = C;
  790. pc1 = pc0 + 2 * ldc;
  791. pc2 = pc1 + 2 * ldc;
  792. pc3 = pc2 + 2 * ldc;
  793. pa0 = A;
  794. #if defined(TRMMKERNEL) && defined(LEFT)
  795. off = offset;
  796. #endif
  797. for (i = (m >> 2); i--;)
  798. {
  799. #if defined(TRMMKERNEL)
  800. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  801. pb0 = B;
  802. #else
  803. pa0 += off * 2 * 4;
  804. pb0 = B + off * 2 * 4;
  805. #endif
  806. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  807. temp = k - off;
  808. #elif defined(LEFT)
  809. temp = off + 4; // number of values in A
  810. #else
  811. temp = off + 4; // number of values in B
  812. #endif
  813. #else
  814. pb0 = B;
  815. temp = k;
  816. #endif
  817. #ifdef ENABLE_PREFETCH
  818. __asm__ __volatile__(
  819. "pref 0, 64(%[pa0]) \n\t"
  820. "pref 0, 96(%[pa0]) \n\t"
  821. "pref 0, 64(%[pb0]) \n\t"
  822. "pref 0, 96(%[pb0]) \n\t"
  823. :
  824. : [pa0] "r" (pa0), [pb0] "r" (pb0)
  825. );
  826. #endif
  827. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  828. ZGEMM_KERNEL_4X4_MSA(, -, , +, +);
  829. #endif
  830. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  831. ZGEMM_KERNEL_4X4_MSA(, +, , +, -);
  832. #endif
  833. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  834. ZGEMM_KERNEL_4X4_MSA(, +, , -, +);
  835. #endif
  836. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  837. ZGEMM_KERNEL_4X4_MSA(, -, , -, -);
  838. #endif
  839. for (l = (temp - 1); l--;)
  840. {
  841. #ifdef ENABLE_PREFETCH
  842. __asm__ __volatile__(
  843. "pref 0, 64(%[pa0]) \n\t"
  844. "pref 0, 96(%[pa0]) \n\t"
  845. "pref 0, 64(%[pb0]) \n\t"
  846. "pref 0, 96(%[pb0]) \n\t"
  847. :
  848. : [pa0] "r" (pa0), [pb0] "r" (pb0)
  849. );
  850. #endif
  851. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  852. ZGEMM_KERNEL_4X4_MSA(+, -, +, +,);
  853. #endif
  854. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  855. ZGEMM_KERNEL_4X4_MSA(+, +, -, +,);
  856. #endif
  857. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  858. ZGEMM_KERNEL_4X4_MSA(+, +, +, -,);
  859. #endif
  860. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  861. ZGEMM_KERNEL_4X4_MSA(+, -, -, -,);
  862. #endif
  863. }
  864. #if defined(TRMMKERNEL)
  865. ZGEMM_TRMM_SCALE_4X4_MSA
  866. #else
  867. ZGEMM_SCALE_4X4_MSA
  868. #endif
  869. #if defined(TRMMKERNEL)
  870. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  871. temp = k - off;
  872. #ifdef LEFT
  873. temp -= 4; // number of values in A
  874. #else
  875. temp -= 4; // number of values in B
  876. #endif
  877. pa0 += temp * 2 * 4;
  878. pb0 += temp * 2 * 4;
  879. #endif
  880. #ifdef LEFT
  881. off += 4; // number of values in A
  882. #endif
  883. #endif
  884. }
  885. if (m & 2)
  886. {
  887. #if defined(TRMMKERNEL)
  888. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  889. pb0 = B;
  890. #else
  891. pa0 += off * 2 * 2;
  892. pb0 = B + off * 2 * 4;
  893. #endif
  894. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  895. temp = k - off;
  896. #elif defined(LEFT)
  897. temp = off + 2; // number of values in A
  898. #else
  899. temp = off + 4; // number of values in B
  900. #endif
  901. #else
  902. pb0 = B;
  903. temp = k;
  904. #endif
  905. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  906. ZGEMM_KERNEL_2X4_MSA(, -, , +, +);
  907. #endif
  908. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  909. ZGEMM_KERNEL_2X4_MSA(, +, , +, -);
  910. #endif
  911. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  912. ZGEMM_KERNEL_2X4_MSA(, +, , -, +);
  913. #endif
  914. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  915. ZGEMM_KERNEL_2X4_MSA(, -, , -, -);
  916. #endif
  917. for (l = (temp - 1); l--;)
  918. {
  919. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  920. ZGEMM_KERNEL_2X4_MSA(+, -, +, +,);
  921. #endif
  922. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  923. ZGEMM_KERNEL_2X4_MSA(+, +, -, +,);
  924. #endif
  925. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  926. ZGEMM_KERNEL_2X4_MSA(+, +, +, -,);
  927. #endif
  928. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  929. ZGEMM_KERNEL_2X4_MSA(+, -, -, -,);
  930. #endif
  931. }
  932. #if defined(TRMMKERNEL)
  933. ZGEMM_TRMM_SCALE_2X4_MSA
  934. #else
  935. ZGEMM_SCALE_2X4_MSA
  936. #endif
  937. #if defined(TRMMKERNEL)
  938. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  939. temp = k - off;
  940. #ifdef LEFT
  941. temp -= 2; // number of values in A
  942. #else
  943. temp -= 4; // number of values in B
  944. #endif
  945. pa0 += temp * 2 * 2;
  946. pb0 += temp * 2 * 4;
  947. #endif
  948. #ifdef LEFT
  949. off += 2; // number of values in A
  950. #endif
  951. #endif
  952. }
  953. if (m & 1)
  954. {
  955. #if defined(TRMMKERNEL)
  956. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  957. pb0 = B;
  958. #else
  959. pa0 += off * 2 * 1;
  960. pb0 = B + off * 2 * 4;
  961. #endif
  962. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  963. temp = k - off;
  964. #elif defined(LEFT)
  965. temp = off + 1; // number of values in A
  966. #else
  967. temp = off + 4; // number of values in B
  968. #endif
  969. #else
  970. pb0 = B;
  971. temp = k;
  972. #endif
  973. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  974. ZGEMM_KERNEL_1X4_MSA(, -, , +, +);
  975. #endif
  976. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  977. ZGEMM_KERNEL_1X4_MSA(, +, , +, -);
  978. #endif
  979. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  980. ZGEMM_KERNEL_1X4_MSA(, +, , -, +);
  981. #endif
  982. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  983. ZGEMM_KERNEL_1X4_MSA(, -, , -, -);
  984. #endif
  985. pa0 += 2;
  986. for (l = (temp - 1); l--;)
  987. {
  988. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  989. ZGEMM_KERNEL_1X4_MSA(+, -, +, +,);
  990. #endif
  991. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  992. ZGEMM_KERNEL_1X4_MSA(+, +, -, +,);
  993. #endif
  994. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  995. ZGEMM_KERNEL_1X4_MSA(+, +, +, -,);
  996. #endif
  997. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  998. ZGEMM_KERNEL_1X4_MSA(+, -, -, -,);
  999. #endif
  1000. pa0 += 2;
  1001. }
  1002. #if defined(TRMMKERNEL)
  1003. ZGEMM_TRMM_SCALE_1X4_MSA
  1004. #else
  1005. ZGEMM_SCALE_1X4_MSA
  1006. #endif
  1007. pc0 += 2;
  1008. pc1 += 2;
  1009. pc2 += 2;
  1010. pc3 += 2;
  1011. #if defined(TRMMKERNEL)
  1012. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1013. temp = k - off;
  1014. #ifdef LEFT
  1015. temp -= 1; // number of values in A
  1016. #else
  1017. temp -= 4; // number of values in B
  1018. #endif
  1019. pa0 += temp * 2 * 1;
  1020. pb0 += temp * 2 * 4;
  1021. #endif
  1022. #ifdef LEFT
  1023. off += 1; // number of values in A
  1024. #endif
  1025. #endif
  1026. }
  1027. #if defined(TRMMKERNEL) && !defined(LEFT)
  1028. off += 4; // number of values in B
  1029. #endif
  1030. B += (k << 3);
  1031. C += (ldc << 3);
  1032. }
  1033. if (n & 2)
  1034. {
  1035. pc0 = C;
  1036. pc1 = pc0 + 2 * ldc;
  1037. pa0 = A;
  1038. #if defined(TRMMKERNEL) && defined(LEFT)
  1039. off = offset;
  1040. #endif
  1041. for (i = (m >> 2); i--;)
  1042. {
  1043. #if defined(TRMMKERNEL)
  1044. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1045. pb0 = B;
  1046. #else
  1047. pa0 += off * 2 * 4;
  1048. pb0 = B + off * 2 * 2;
  1049. #endif
  1050. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1051. temp = k - off;
  1052. #elif defined(LEFT)
  1053. temp = off + 4; // number of values in A
  1054. #else
  1055. temp = off + 2; // number of values in B
  1056. #endif
  1057. #else
  1058. pb0 = B;
  1059. temp = k;
  1060. #endif
  1061. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1062. ZGEMM_KERNEL_4X2_MSA(, -, , +, +);
  1063. #endif
  1064. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1065. ZGEMM_KERNEL_4X2_MSA(, +, , +, -);
  1066. #endif
  1067. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1068. ZGEMM_KERNEL_4X2_MSA(, +, , -, +);
  1069. #endif
  1070. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1071. ZGEMM_KERNEL_4X2_MSA(, -, , -, -);
  1072. #endif
  1073. for (l = (temp - 1); l--;)
  1074. {
  1075. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1076. ZGEMM_KERNEL_4X2_MSA(+, -, +, +,);
  1077. #endif
  1078. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1079. ZGEMM_KERNEL_4X2_MSA(+, +, -, +,);
  1080. #endif
  1081. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1082. ZGEMM_KERNEL_4X2_MSA(+, +, +, -,);
  1083. #endif
  1084. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1085. ZGEMM_KERNEL_4X2_MSA(+, -, -, -,);
  1086. #endif
  1087. }
  1088. #if defined(TRMMKERNEL)
  1089. ZGEMM_TRMM_SCALE_4X2_MSA
  1090. #else
  1091. ZGEMM_SCALE_4X2_MSA
  1092. #endif
  1093. #if defined(TRMMKERNEL)
  1094. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1095. temp = k - off;
  1096. #ifdef LEFT
  1097. temp -= 4; // number of values in A
  1098. #else
  1099. temp -= 2; // number of values in B
  1100. #endif
  1101. pa0 += temp * 2 * 4;
  1102. pb0 += temp * 2 * 2;
  1103. #endif
  1104. #ifdef LEFT
  1105. off += 4; // number of values in A
  1106. #endif
  1107. #endif
  1108. }
  1109. if (m & 2)
  1110. {
  1111. #if defined(TRMMKERNEL)
  1112. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1113. pb0 = B;
  1114. #else
  1115. pa0 += off * 2 * 2;
  1116. pb0 = B + off * 2 * 2;
  1117. #endif
  1118. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1119. temp = k - off;
  1120. #elif defined(LEFT)
  1121. temp = off + 2; // number of values in A
  1122. #else
  1123. temp = off + 2; // number of values in B
  1124. #endif
  1125. #else
  1126. pb0 = B;
  1127. temp = k;
  1128. #endif
  1129. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1130. ZGEMM_KERNEL_2X2_MSA(, -, , +, +);
  1131. #endif
  1132. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1133. ZGEMM_KERNEL_2X2_MSA(, +, , +, -);
  1134. #endif
  1135. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1136. ZGEMM_KERNEL_2X2_MSA(, +, , -, +);
  1137. #endif
  1138. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1139. ZGEMM_KERNEL_2X2_MSA(, -, , -, -);
  1140. #endif
  1141. for (l = (temp - 1); l--;)
  1142. {
  1143. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1144. ZGEMM_KERNEL_2X2_MSA(+, -, +, +,);
  1145. #endif
  1146. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1147. ZGEMM_KERNEL_2X2_MSA(+, +, -, +,);
  1148. #endif
  1149. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1150. ZGEMM_KERNEL_2X2_MSA(+, +, +, -,);
  1151. #endif
  1152. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1153. ZGEMM_KERNEL_2X2_MSA(+, -, -, -,);
  1154. #endif
  1155. }
  1156. #if defined(TRMMKERNEL)
  1157. ZGEMM_TRMM_SCALE_2X2_MSA
  1158. #else
  1159. ZGEMM_SCALE_2X2_MSA
  1160. #endif
  1161. #if defined(TRMMKERNEL)
  1162. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1163. temp = k - off;
  1164. #ifdef LEFT
  1165. temp -= 2; // number of values in A
  1166. #else
  1167. temp -= 2; // number of values in B
  1168. #endif
  1169. pa0 += temp * 2 * 2;
  1170. pb0 += temp * 2 * 2;
  1171. #endif
  1172. #ifdef LEFT
  1173. off += 2; // number of values in A
  1174. #endif
  1175. #endif
  1176. }
  1177. if (m & 1)
  1178. {
  1179. #if defined(TRMMKERNEL)
  1180. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1181. pb0 = B;
  1182. #else
  1183. pa0 += off * 2 * 1;
  1184. pb0 = B + off * 2 * 2;
  1185. #endif
  1186. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1187. temp = k - off;
  1188. #elif defined(LEFT)
  1189. temp = off + 1; // number of values in A
  1190. #else
  1191. temp = off + 2; // number of values in B
  1192. #endif
  1193. #else
  1194. pb0 = B;
  1195. temp = k;
  1196. #endif
  1197. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1198. ZGEMM_KERNEL_1X2_MSA(, -, , +, +);
  1199. #endif
  1200. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1201. ZGEMM_KERNEL_1X2_MSA(, +, , +, -);
  1202. #endif
  1203. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1204. ZGEMM_KERNEL_1X2_MSA(, +, , -, +);
  1205. #endif
  1206. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1207. ZGEMM_KERNEL_1X2_MSA(, -, , -, -);
  1208. #endif
  1209. pa0 += 2;
  1210. for (l = (temp - 1); l--;)
  1211. {
  1212. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1213. ZGEMM_KERNEL_1X2_MSA(+, -, +, +,);
  1214. #endif
  1215. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1216. ZGEMM_KERNEL_1X2_MSA(+, +, -, +,);
  1217. #endif
  1218. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1219. ZGEMM_KERNEL_1X2_MSA(+, +, +, -,);
  1220. #endif
  1221. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1222. ZGEMM_KERNEL_1X2_MSA(+, -, -, -,);
  1223. #endif
  1224. pa0 += 2;
  1225. }
  1226. #if defined(TRMMKERNEL)
  1227. ZGEMM_TRMM_SCALE_1X2_MSA
  1228. #else
  1229. ZGEMM_SCALE_1X2_MSA
  1230. #endif
  1231. pc0 += 2;
  1232. pc1 += 2;
  1233. #if defined(TRMMKERNEL)
  1234. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1235. temp = k - off;
  1236. #ifdef LEFT
  1237. temp -= 1; // number of values in A
  1238. #else
  1239. temp -= 2; // number of values in B
  1240. #endif
  1241. pa0 += temp * 2 * 1;
  1242. pb0 += temp * 2 * 2;
  1243. #endif
  1244. #ifdef LEFT
  1245. off += 1; // number of values in A
  1246. #endif
  1247. #endif
  1248. }
  1249. #if defined(TRMMKERNEL) && !defined(LEFT)
  1250. off += 2; // number of values in B
  1251. #endif
  1252. B += (k << 2);
  1253. C += (ldc << 2);
  1254. }
  1255. if (n & 1)
  1256. {
  1257. pc0 = C;
  1258. pa0 = A;
  1259. #if defined(TRMMKERNEL) && defined(LEFT)
  1260. off = offset;
  1261. #endif
  1262. for (i = (m >> 2); i--;)
  1263. {
  1264. #if defined(TRMMKERNEL)
  1265. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1266. pb0 = B;
  1267. #else
  1268. pa0 += off * 2 * 4;
  1269. pb0 = B + off * 2 * 1;
  1270. #endif
  1271. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1272. temp = k - off;
  1273. #elif defined(LEFT)
  1274. temp = off + 4; // number of values in A
  1275. #else
  1276. temp = off + 1; // number of values in B
  1277. #endif
  1278. #else
  1279. pb0 = B;
  1280. temp = k;
  1281. #endif
  1282. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1283. ZGEMM_KERNEL_4X1_MSA(, -, , +, +);
  1284. #endif
  1285. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1286. ZGEMM_KERNEL_4X1_MSA(, +, , +, -);
  1287. #endif
  1288. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1289. ZGEMM_KERNEL_4X1_MSA(, +, , -, +);
  1290. #endif
  1291. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1292. ZGEMM_KERNEL_4X1_MSA(, -, , -, -);
  1293. #endif
  1294. pb0 += 2;
  1295. for (l = (temp - 1); l--;)
  1296. {
  1297. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1298. ZGEMM_KERNEL_4X1_MSA(+, -, +, +,);
  1299. #endif
  1300. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1301. ZGEMM_KERNEL_4X1_MSA(+, +, -, +,);
  1302. #endif
  1303. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1304. ZGEMM_KERNEL_4X1_MSA(+, +, +, -,);
  1305. #endif
  1306. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1307. ZGEMM_KERNEL_4X1_MSA(+, -, -, -,);
  1308. #endif
  1309. pb0 += 2;
  1310. }
  1311. #if defined(TRMMKERNEL)
  1312. ZGEMM_TRMM_SCALE_4X1_MSA
  1313. #else
  1314. ZGEMM_SCALE_4X1_MSA
  1315. #endif
  1316. #if defined(TRMMKERNEL)
  1317. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1318. temp = k - off;
  1319. #ifdef LEFT
  1320. temp -= 4; // number of values in A
  1321. #else
  1322. temp -= 1; // number of values in B
  1323. #endif
  1324. pa0 += temp * 2 * 4;
  1325. pb0 += temp * 2 * 1;
  1326. #endif
  1327. #ifdef LEFT
  1328. off += 4; // number of values in A
  1329. #endif
  1330. #endif
  1331. }
  1332. if (m & 2)
  1333. {
  1334. #if defined(TRMMKERNEL)
  1335. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1336. pb0 = B;
  1337. #else
  1338. pa0 += off * 2 * 2;
  1339. pb0 = B + off * 2 * 1;
  1340. #endif
  1341. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1342. temp = k - off;
  1343. #elif defined(LEFT)
  1344. temp = off + 2; // number of values in A
  1345. #else
  1346. temp = off + 1; // number of values in B
  1347. #endif
  1348. #else
  1349. pb0 = B;
  1350. temp = k;
  1351. #endif
  1352. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1353. ZGEMM_KERNEL_2X1_MSA(, -, , +, +);
  1354. #endif
  1355. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1356. ZGEMM_KERNEL_2X1_MSA(, +, , +, -);
  1357. #endif
  1358. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1359. ZGEMM_KERNEL_2X1_MSA(, +, , -, +);
  1360. #endif
  1361. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1362. ZGEMM_KERNEL_2X1_MSA(, -, , -, -);
  1363. #endif
  1364. pb0 += 2;
  1365. for (l = (temp - 1); l--;)
  1366. {
  1367. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1368. ZGEMM_KERNEL_2X1_MSA(+, -, +, +,);
  1369. #endif
  1370. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1371. ZGEMM_KERNEL_2X1_MSA(+, +, -, +,);
  1372. #endif
  1373. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1374. ZGEMM_KERNEL_2X1_MSA(+, +, +, -,);
  1375. #endif
  1376. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1377. ZGEMM_KERNEL_2X1_MSA(+, -, -, -,);
  1378. #endif
  1379. pb0 += 2;
  1380. }
  1381. #if defined(TRMMKERNEL)
  1382. ZGEMM_TRMM_SCALE_2X1_MSA
  1383. #else
  1384. ZGEMM_SCALE_2X1_MSA
  1385. #endif
  1386. #if defined(TRMMKERNEL)
  1387. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1388. temp = k - off;
  1389. #ifdef LEFT
  1390. temp -= 2; // number of values in A
  1391. #else
  1392. temp -= 1; // number of values in B
  1393. #endif
  1394. pa0 += temp * 2 * 2;
  1395. pb0 += temp * 2 * 1;
  1396. #endif
  1397. #ifdef LEFT
  1398. off += 2; // number of values in A
  1399. #endif
  1400. #endif
  1401. }
  1402. if (m & 1)
  1403. {
  1404. #if defined(TRMMKERNEL)
  1405. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1406. pb0 = B;
  1407. #else
  1408. pa0 += off * 2 * 1;
  1409. pb0 = B + off * 2 * 1;
  1410. #endif
  1411. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1412. temp = k - off;
  1413. #elif defined(LEFT)
  1414. temp = off + 1; // number of values in A
  1415. #else
  1416. temp = off + 1; // number of values in B
  1417. #endif
  1418. #else
  1419. pb0 = B;
  1420. temp = k;
  1421. #endif
  1422. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1423. ZGEMM_KERNEL_1X1(, -, , +, +);
  1424. #endif
  1425. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1426. ZGEMM_KERNEL_1X1(, +, , +, -);
  1427. #endif
  1428. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1429. ZGEMM_KERNEL_1X1(, +, , -, +);
  1430. #endif
  1431. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1432. ZGEMM_KERNEL_1X1(, -, , -, -);
  1433. #endif
  1434. pa0 += 2;
  1435. pb0 += 2;
  1436. for (l = (temp - 1); l--;)
  1437. {
  1438. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1439. ZGEMM_KERNEL_1X1(+, -, +, +,);
  1440. #endif
  1441. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1442. ZGEMM_KERNEL_1X1(+, +, -, +,);
  1443. #endif
  1444. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1445. ZGEMM_KERNEL_1X1(+, +, +, -,);
  1446. #endif
  1447. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1448. ZGEMM_KERNEL_1X1(+, -, -, -,);
  1449. #endif
  1450. pa0 += 2;
  1451. pb0 += 2;
  1452. }
  1453. #if defined(TRMMKERNEL)
  1454. ZGEMM_TRMM_SCALE_1X1
  1455. #else
  1456. ZGEMM_SCALE_1X1
  1457. #endif
  1458. pc0 += 2;
  1459. #if defined(TRMMKERNEL)
  1460. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1461. temp = k - off;
  1462. #ifdef LEFT
  1463. temp -= 1; // number of values in A
  1464. #else
  1465. temp -= 1; // number of values in B
  1466. #endif
  1467. pa0 += temp * 2 * 1;
  1468. pb0 += temp * 2 * 1;
  1469. #endif
  1470. #ifdef LEFT
  1471. off += 1; // number of values in A
  1472. #endif
  1473. #endif
  1474. }
  1475. #if defined(TRMMKERNEL) && !defined(LEFT)
  1476. off += 1; // number of values in B
  1477. #endif
  1478. B += (k << 1);
  1479. C += (ldc << 1);
  1480. }
  1481. return 0;
  1482. }