You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

cgemm_kernel_8x4_msa.c 78 kB


  1. /*******************************************************************************
  2. Copyright (c) 2016, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #include "common.h"
  28. #include "macros_msa.h"
  /* One k-step of the vectorized 8x4 complex micro-kernel.
   * Loads four vectors from pa0 and two from pb0 (the *_INC loaders come from
   * macros_msa.h and presumably advance the pointers -- confirm there).
   * PCKEVOD_W2_SP splits each pair of A vectors into src_aNr / src_aNi,
   * presumably the real and imaginary lanes.  For each of the four B columns,
   * SPLATI_W2_SP fills src_br / src_bi from src_b0 or src_b1 (lane index 0
   * or 2) and the accumulators are updated as
   *     resN_r OP0= a_r * b_r;        resN_r OP1= a_i * b_i;
   *     resN_i OP2= (OP4 a_r) * b_i;  resN_i OP3= a_i * b_r;
   * OP0..OP3 are pasted onto '=' with ##, so each argument is an operator
   * prefix such as '+' or '-' (an empty argument yields plain '=').
   * OP4 is a unary sign applied to the a_r operand.
   * Columns 0..3 accumulate into res0/1, res2/3, res4/5 and res6/7.
   * Expects pa0, pb0, src_a*, src_b*, src_br, src_bi and res0_r..res7_i to
   * be in scope at the expansion site. */
  29. #define CGEMM_KERNEL_8X4_MSA(OP0, OP1, OP2, OP3, OP4) \
  30. { \
  31. LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \
  32. LD_SP2_INC(pb0, 4, src_b0, src_b1); \
  33. \
  34. PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \
  35. PCKEVOD_W2_SP(src_a3, src_a2, src_a1r, src_a1i); \
  36. \
  37. /* 0th col */ \
  38. SPLATI_W2_SP(src_b0, 0, src_br, src_bi); \
  39. res0_r OP0## = src_a0r * src_br; \
  40. res0_r OP1## = src_a0i * src_bi; \
  41. res0_i OP2## = (OP4 src_a0r) * src_bi; \
  42. res0_i OP3## = src_a0i * src_br; \
  43. \
  44. res1_r OP0## = src_a1r * src_br; \
  45. res1_r OP1## = src_a1i * src_bi; \
  46. res1_i OP2## = (OP4 src_a1r) * src_bi; \
  47. res1_i OP3## = src_a1i * src_br; \
  48. \
  49. /* 1st col */ \
  50. SPLATI_W2_SP(src_b0, 2, src_br, src_bi); \
  51. res2_r OP0## = src_a0r * src_br; \
  52. res2_r OP1## = src_a0i * src_bi; \
  53. res2_i OP2## = (OP4 src_a0r) * src_bi; \
  54. res2_i OP3## = src_a0i * src_br; \
  55. \
  56. res3_r OP0## = src_a1r * src_br; \
  57. res3_r OP1## = src_a1i * src_bi; \
  58. res3_i OP2## = (OP4 src_a1r) * src_bi; \
  59. res3_i OP3## = src_a1i * src_br; \
  60. \
  61. /* 2nd col */ \
  62. SPLATI_W2_SP(src_b1, 0, src_br, src_bi); \
  63. res4_r OP0## = src_a0r * src_br; \
  64. res4_r OP1## = src_a0i * src_bi; \
  65. res4_i OP2## = (OP4 src_a0r) * src_bi; \
  66. res4_i OP3## = src_a0i * src_br; \
  67. \
  68. res5_r OP0## = src_a1r * src_br; \
  69. res5_r OP1## = src_a1i * src_bi; \
  70. res5_i OP2## = (OP4 src_a1r) * src_bi; \
  71. res5_i OP3## = src_a1i * src_br; \
  72. \
  73. /* 3rd col */ \
  74. SPLATI_W2_SP(src_b1, 2, src_br, src_bi); \
  75. res6_r OP0## = src_a0r * src_br; \
  76. res6_r OP1## = src_a0i * src_bi; \
  77. res6_i OP2## = (OP4 src_a0r) * src_bi; \
  78. res6_i OP3## = src_a0i * src_br; \
  79. \
  80. res7_r OP0## = src_a1r * src_br; \
  81. res7_r OP1## = src_a1i * src_bi; \
  82. res7_i OP2## = (OP4 src_a1r) * src_bi; \
  83. res7_i OP3## = src_a1i * src_br; \
  84. }
  /* One k-step of the vectorized 8x2 complex micro-kernel.
   * Loads four vectors from pa0 with LD_SP4_INC and a single vector from pb0
   * with LD_SP; pb0 is not written here, so advancing it is left to the
   * expansion site.  The A vectors are split into src_aNr / src_aNi by
   * PCKEVOD_W2_SP (presumably real / imaginary lanes -- confirm in
   * macros_msa.h).  The two B columns are taken from lanes 0.. and 2.. of
   * src_b0 and accumulate into res0/res1 and res2/res3 respectively:
   *     resN_r OP0= a_r * b_r;        resN_r OP1= a_i * b_i;
   *     resN_i OP2= (OP4 a_r) * b_i;  resN_i OP3= a_i * b_r;
   * OP0..OP3 are operator prefixes pasted onto '=' with ##; OP4 is a unary
   * sign on the a_r operand. */
  85. #define CGEMM_KERNEL_8X2_MSA(OP0, OP1, OP2, OP3, OP4) \
  86. { \
  87. LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \
  88. src_b0 = LD_SP(pb0); \
  89. \
  90. PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \
  91. PCKEVOD_W2_SP(src_a3, src_a2, src_a1r, src_a1i); \
  92. \
  93. /* 0th col */ \
  94. SPLATI_W2_SP(src_b0, 0, src_br, src_bi); \
  95. res0_r OP0## = src_a0r * src_br; \
  96. res0_r OP1## = src_a0i * src_bi; \
  97. res0_i OP2## = (OP4 src_a0r) * src_bi; \
  98. res0_i OP3## = src_a0i * src_br; \
  99. \
  100. res1_r OP0## = src_a1r * src_br; \
  101. res1_r OP1## = src_a1i * src_bi; \
  102. res1_i OP2## = (OP4 src_a1r) * src_bi; \
  103. res1_i OP3## = src_a1i * src_br; \
  104. \
  105. /* 1st col */ \
  106. SPLATI_W2_SP(src_b0, 2, src_br, src_bi); \
  107. res2_r OP0## = src_a0r * src_br; \
  108. res2_r OP1## = src_a0i * src_bi; \
  109. res2_i OP2## = (OP4 src_a0r) * src_bi; \
  110. res2_i OP3## = src_a0i * src_br; \
  111. \
  112. res3_r OP0## = src_a1r * src_br; \
  113. res3_r OP1## = src_a1i * src_bi; \
  114. res3_i OP2## = (OP4 src_a1r) * src_bi; \
  115. res3_i OP3## = src_a1i * src_br; \
  116. }
  /* One k-step of the vectorized 8x1 complex micro-kernel.
   * Loads four vectors from pa0 with LD_SP4_INC.  The single B element is
   * read by reinterpreting pb0 as a double (i.e. both floats at pb0[0..1] in
   * one access) and passing it to COPY_DOUBLE_TO_VECTOR; src_bi is used as a
   * temporary for that vector before SPLATI_W2_SP overwrites it, so this
   * relies on SPLATI_W2_SP reading its input before writing its second
   * output (NOTE(review): confirm against macros_msa.h).  pb0 itself is not
   * modified here.
   * Accumulates into res0_r/res0_i and res1_r/res1_i:
   *     resN_r OP0= a_r * b_r;        resN_r OP1= a_i * b_i;
   *     resN_i OP2= (OP4 a_r) * b_i;  resN_i OP3= a_i * b_r;
   * OP0..OP3 are operator prefixes pasted onto '=' with ##; OP4 is a unary
   * sign on the a_r operand. */
  117. #define CGEMM_KERNEL_8X1_MSA(OP0, OP1, OP2, OP3, OP4) \
  118. { \
  119. LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \
  120. src_bi = (v4f32) COPY_DOUBLE_TO_VECTOR(*((double *) pb0)); \
  121. SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \
  122. \
  123. PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \
  124. PCKEVOD_W2_SP(src_a3, src_a2, src_a1r, src_a1i); \
  125. \
  126. /* 0th col */ \
  127. res0_r OP0## = src_a0r * src_br; \
  128. res0_r OP1## = src_a0i * src_bi; \
  129. res0_i OP2## = (OP4 src_a0r) * src_bi; \
  130. res0_i OP3## = src_a0i * src_br; \
  131. \
  132. res1_r OP0## = src_a1r * src_br; \
  133. res1_r OP1## = src_a1i * src_bi; \
  134. res1_i OP2## = (OP4 src_a1r) * src_bi; \
  135. res1_i OP3## = src_a1i * src_br; \
  136. }
  /* One k-step of the vectorized 4x4 complex micro-kernel.
   * Loads two vectors from pa0 and two from pb0 with LD_SP2_INC (presumably
   * advancing both pointers -- confirm in macros_msa.h), splits the A pair
   * into src_a0r / src_a0i with PCKEVOD_W2_SP, and for each of the four B
   * columns updates one accumulator pair.  Only the even-numbered
   * accumulators are used: res0, res2, res4, res6 (columns 0..3).
   *     resN_r OP0= a_r * b_r;      resN_r OP1= a_i * b_i;
   *     resN_i OP2= OP4 a_r * b_i;  resN_i OP3= a_i * b_r;
   * OP0..OP3 are operator prefixes pasted onto '=' with ##.  OP4 is written
   * without parentheses here; as a unary operator it binds tighter than '*',
   * so it still applies to a_r only. */
  137. #define CGEMM_KERNEL_4X4_MSA(OP0, OP1, OP2, OP3, OP4) \
  138. { \
  139. LD_SP2_INC(pa0, 4, src_a0, src_a1); \
  140. LD_SP2_INC(pb0, 4, src_b0, src_b1); \
  141. \
  142. PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \
  143. \
  144. /* 0th col */ \
  145. SPLATI_W2_SP(src_b0, 0, src_br, src_bi); \
  146. res0_r OP0## = src_a0r * src_br; \
  147. res0_r OP1## = src_a0i * src_bi; \
  148. res0_i OP2## = OP4 src_a0r * src_bi; \
  149. res0_i OP3## = src_a0i * src_br; \
  150. \
  151. /* 1st col */ \
  152. SPLATI_W2_SP(src_b0, 2, src_br, src_bi); \
  153. res2_r OP0## = src_a0r * src_br; \
  154. res2_r OP1## = src_a0i * src_bi; \
  155. res2_i OP2## = OP4 src_a0r * src_bi; \
  156. res2_i OP3## = src_a0i * src_br; \
  157. \
  158. /* 2nd col */ \
  159. SPLATI_W2_SP(src_b1, 0, src_br, src_bi); \
  160. res4_r OP0## = src_a0r * src_br; \
  161. res4_r OP1## = src_a0i * src_bi; \
  162. res4_i OP2## = OP4 src_a0r * src_bi; \
  163. res4_i OP3## = src_a0i * src_br; \
  164. \
  165. /* 3rd col */ \
  166. SPLATI_W2_SP(src_b1, 2, src_br, src_bi); \
  167. res6_r OP0## = src_a0r * src_br; \
  168. res6_r OP1## = src_a0i * src_bi; \
  169. res6_i OP2## = OP4 src_a0r * src_bi; \
  170. res6_i OP3## = src_a0i * src_br; \
  171. }
  /* One k-step of the vectorized 4x2 complex micro-kernel.
   * Loads two vectors from pa0 with LD_SP2_INC and one vector from pb0 with
   * LD_SP (pb0 is not modified here).  The A pair is split into
   * src_a0r / src_a0i by PCKEVOD_W2_SP.  Column 0 (lanes from index 0 of
   * src_b0) accumulates into res0_r/res0_i and column 1 (lanes from index 2)
   * into res2_r/res2_i:
   *     resN_r OP0= a_r * b_r;      resN_r OP1= a_i * b_i;
   *     resN_i OP2= OP4 a_r * b_i;  resN_i OP3= a_i * b_r;
   * OP0..OP3 are operator prefixes pasted onto '=' with ##; OP4 is a unary
   * sign that binds to a_r before the multiplication. */
  172. #define CGEMM_KERNEL_4X2_MSA(OP0, OP1, OP2, OP3, OP4) \
  173. { \
  174. LD_SP2_INC(pa0, 4, src_a0, src_a1); \
  175. src_b0 = LD_SP(pb0); \
  176. \
  177. PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \
  178. \
  179. /* 0th col */ \
  180. SPLATI_W2_SP(src_b0, 0, src_br, src_bi); \
  181. res0_r OP0## = src_a0r * src_br; \
  182. res0_r OP1## = src_a0i * src_bi; \
  183. res0_i OP2## = OP4 src_a0r * src_bi; \
  184. res0_i OP3## = src_a0i * src_br; \
  185. \
  186. /* 1st col */ \
  187. SPLATI_W2_SP(src_b0, 2, src_br, src_bi); \
  188. res2_r OP0## = src_a0r * src_br; \
  189. res2_r OP1## = src_a0i * src_bi; \
  190. res2_i OP2## = OP4 src_a0r * src_bi; \
  191. res2_i OP3## = src_a0i * src_br; \
  192. }
  /* One k-step of the vectorized 4x1 complex micro-kernel.
   * Loads two vectors from pa0 with LD_SP2_INC.  The single B element is
   * fetched by reading pb0 through a double pointer (both floats at once)
   * into a vector via COPY_DOUBLE_TO_VECTOR; src_bi holds that temporary
   * until SPLATI_W2_SP replaces it (NOTE(review): relies on SPLATI_W2_SP
   * reading its input before writing its second output -- confirm in
   * macros_msa.h).  pb0 is not modified here.
   * Accumulates into res0_r / res0_i:
   *     res0_r OP0= a_r * b_r;      res0_r OP1= a_i * b_i;
   *     res0_i OP2= OP4 a_r * b_i;  res0_i OP3= a_i * b_r;
   * OP0..OP3 are operator prefixes pasted onto '=' with ##; OP4 is a unary
   * sign that binds to a_r before the multiplication. */
  193. #define CGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4) \
  194. { \
  195. LD_SP2_INC(pa0, 4, src_a0, src_a1); \
  196. src_bi = (v4f32) COPY_DOUBLE_TO_VECTOR(*((double *) pb0)); \
  197. SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \
  198. \
  199. PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \
  200. \
  201. /* 0th col */ \
  202. res0_r OP0## = src_a0r * src_br; \
  203. res0_r OP1## = src_a0i * src_bi; \
  204. res0_i OP2## = OP4 src_a0r * src_bi; \
  205. res0_i OP3## = src_a0i * src_br; \
  206. }
  /* One k-step of the scalar 2x4 complex micro-kernel.
   * Reads two complex A elements (pa0[0..3]) and four complex B elements
   * (pb0[0..7]) as interleaved scalar pairs; neither pointer is advanced
   * here.  Each (row, column) product uses an accumulator pair
   * (res2k, res2k+1):
   *     res2k   OP0= a_r * b_r;      res2k   OP1= a_i * b_i;
   *     res2k+1 OP2= OP4 a_r * b_i;  res2k+1 OP3= a_i * b_r;
   * Column 0 uses res0..res3 (its statements come first and carry no column
   * label), column 1 res4..res7, column 2 res8..res11, column 3 res12..res15.
   * OP0..OP3 are operator prefixes pasted onto '=' with ##; OP4 is a unary
   * sign that binds to a_r before the multiplication. */
  207. #define CGEMM_KERNEL_2X4(OP0, OP1, OP2, OP3, OP4) \
  208. { \
  209. a0_r = pa0[0]; \
  210. a0_i = pa0[1]; \
  211. b0_r = pb0[0]; \
  212. b0_i = pb0[1]; \
  213. \
  214. res0 OP0## = a0_r * b0_r; \
  215. res0 OP1## = a0_i * b0_i; \
  216. res1 OP2## = OP4 a0_r * b0_i; \
  217. res1 OP3## = a0_i * b0_r; \
  218. \
  219. a1_r = pa0[2]; \
  220. a1_i = pa0[3]; \
  221. res2 OP0## = a1_r * b0_r; \
  222. res2 OP1## = a1_i * b0_i; \
  223. res3 OP2## = OP4 a1_r * b0_i; \
  224. res3 OP3## = a1_i * b0_r; \
  225. \
  226. /* 1st col */ \
  227. b1_r = pb0[2]; \
  228. b1_i = pb0[3]; \
  229. res4 OP0## = a0_r * b1_r; \
  230. res4 OP1## = a0_i * b1_i; \
  231. res5 OP2## = OP4 a0_r * b1_i; \
  232. res5 OP3## = a0_i * b1_r; \
  233. \
  234. res6 OP0## = a1_r * b1_r; \
  235. res6 OP1## = a1_i * b1_i; \
  236. res7 OP2## = OP4 a1_r * b1_i; \
  237. res7 OP3## = a1_i * b1_r; \
  238. \
  239. /* 2nd col */ \
  240. b2_r = pb0[4]; \
  241. b2_i = pb0[5]; \
  242. res8 OP0## = a0_r * b2_r; \
  243. res8 OP1## = a0_i * b2_i; \
  244. res9 OP2## = OP4 a0_r * b2_i; \
  245. res9 OP3## = a0_i * b2_r; \
  246. \
  247. res10 OP0## = a1_r * b2_r; \
  248. res10 OP1## = a1_i * b2_i; \
  249. res11 OP2## = OP4 a1_r * b2_i; \
  250. res11 OP3## = a1_i * b2_r; \
  251. \
  252. /* 3rd col */ \
  253. b3_r = pb0[6]; \
  254. b3_i = pb0[7]; \
  255. res12 OP0## = a0_r * b3_r; \
  256. res12 OP1## = a0_i * b3_i; \
  257. res13 OP2## = OP4 a0_r * b3_i; \
  258. res13 OP3## = a0_i * b3_r; \
  259. \
  260. res14 OP0## = a1_r * b3_r; \
  261. res14 OP1## = a1_i * b3_i; \
  262. res15 OP2## = OP4 a1_r * b3_i; \
  263. res15 OP3## = a1_i * b3_r; \
  264. }
  /* One k-step of the scalar 2x2 complex micro-kernel.
   * Reads two complex A elements (pa0[0..3]) and two complex B elements
   * (pb0[0..3]); neither pointer is advanced here.  Each (row, column)
   * product uses an accumulator pair:
   *     res2k   OP0= a_r * b_r;      res2k   OP1= a_i * b_i;
   *     res2k+1 OP2= OP4 a_r * b_i;  res2k+1 OP3= a_i * b_r;
   * Column 0 uses res0..res3 and column 1 uses res4..res7.
   * OP0..OP3 are operator prefixes pasted onto '=' with ##; OP4 is a unary
   * sign that binds to a_r before the multiplication. */
  265. #define CGEMM_KERNEL_2X2(OP0, OP1, OP2, OP3, OP4) \
  266. { \
  267. /* 0th col */ \
  268. a0_r = pa0[0]; \
  269. a0_i = pa0[1]; \
  270. b0_r = pb0[0]; \
  271. b0_i = pb0[1]; \
  272. \
  273. res0 OP0## = a0_r * b0_r; \
  274. res0 OP1## = a0_i * b0_i; \
  275. res1 OP2## = OP4 a0_r * b0_i; \
  276. res1 OP3## = a0_i * b0_r; \
  277. \
  278. a1_r = pa0[2]; \
  279. a1_i = pa0[3]; \
  280. res2 OP0## = a1_r * b0_r; \
  281. res2 OP1## = a1_i * b0_i; \
  282. res3 OP2## = OP4 a1_r * b0_i; \
  283. res3 OP3## = a1_i * b0_r; \
  284. \
  285. /* 1st col */ \
  286. b1_r = pb0[2]; \
  287. b1_i = pb0[3]; \
  288. res4 OP0## = a0_r * b1_r; \
  289. res4 OP1## = a0_i * b1_i; \
  290. res5 OP2## = OP4 a0_r * b1_i; \
  291. res5 OP3## = a0_i * b1_r; \
  292. \
  293. res6 OP0## = a1_r * b1_r; \
  294. res6 OP1## = a1_i * b1_i; \
  295. res7 OP2## = OP4 a1_r * b1_i; \
  296. res7 OP3## = a1_i * b1_r; \
  297. }
  /* One k-step of the scalar 2x1 complex micro-kernel.
   * Reads two complex A elements (pa0[0..3]) and one complex B element
   * (pb0[0..1]); neither pointer is advanced here.  Row 0 accumulates into
   * res0/res1 and row 1 into res2/res3:
   *     res2k   OP0= a_r * b_r;      res2k   OP1= a_i * b_i;
   *     res2k+1 OP2= OP4 a_r * b_i;  res2k+1 OP3= a_i * b_r;
   * OP0..OP3 are operator prefixes pasted onto '=' with ##; OP4 is a unary
   * sign that binds to a_r before the multiplication. */
  298. #define CGEMM_KERNEL_2X1(OP0, OP1, OP2, OP3, OP4) \
  299. { \
  300. /* 0th col */ \
  301. a0_r = pa0[0]; \
  302. a0_i = pa0[1]; \
  303. b0_r = pb0[0]; \
  304. b0_i = pb0[1]; \
  305. \
  306. res0 OP0## = a0_r * b0_r; \
  307. res0 OP1## = a0_i * b0_i; \
  308. res1 OP2## = OP4 a0_r * b0_i; \
  309. res1 OP3## = a0_i * b0_r; \
  310. \
  311. a1_r = pa0[2]; \
  312. a1_i = pa0[3]; \
  313. res2 OP0## = a1_r * b0_r; \
  314. res2 OP1## = a1_i * b0_i; \
  315. res3 OP2## = OP4 a1_r * b0_i; \
  316. res3 OP3## = a1_i * b0_r; \
  317. }
  /* One k-step of the scalar 1x4 complex micro-kernel.
   * Reads one complex A element (pa0[0..1]) and four complex B elements
   * (pb0[0..7]); neither pointer is advanced here.  Columns 0..3 accumulate
   * into res0/res1, res2/res3, res4/res5 and res6/res7:
   *     res2k   OP0= a_r * b_r;      res2k   OP1= a_i * b_i;
   *     res2k+1 OP2= OP4 a_r * b_i;  res2k+1 OP3= a_i * b_r;
   * OP0..OP3 are operator prefixes pasted onto '=' with ##; OP4 is a unary
   * sign that binds to a_r before the multiplication. */
  318. #define CGEMM_KERNEL_1X4(OP0, OP1, OP2, OP3, OP4) \
  319. { \
  320. /* 0th col */ \
  321. a0_r = pa0[0]; \
  322. a0_i = pa0[1]; \
  323. b0_r = pb0[0]; \
  324. b0_i = pb0[1]; \
  325. \
  326. res0 OP0## = a0_r * b0_r; \
  327. res0 OP1## = a0_i * b0_i; \
  328. res1 OP2## = OP4 a0_r * b0_i; \
  329. res1 OP3## = a0_i * b0_r; \
  330. \
  331. /* 1st col */ \
  332. b1_r = pb0[2]; \
  333. b1_i = pb0[3]; \
  334. res2 OP0## = a0_r * b1_r; \
  335. res2 OP1## = a0_i * b1_i; \
  336. res3 OP2## = OP4 a0_r * b1_i; \
  337. res3 OP3## = a0_i * b1_r; \
  338. \
  339. /* 2nd col */ \
  340. b2_r = pb0[4]; \
  341. b2_i = pb0[5]; \
  342. res4 OP0## = a0_r * b2_r; \
  343. res4 OP1## = a0_i * b2_i; \
  344. res5 OP2## = OP4 a0_r * b2_i; \
  345. res5 OP3## = a0_i * b2_r; \
  346. \
  347. /* 3rd col */ \
  348. b3_r = pb0[6]; \
  349. b3_i = pb0[7]; \
  350. res6 OP0## = a0_r * b3_r; \
  351. res6 OP1## = a0_i * b3_i; \
  352. res7 OP2## = OP4 a0_r * b3_i; \
  353. res7 OP3## = a0_i * b3_r; \
  354. }
  /* One k-step of the scalar 1x2 complex micro-kernel.
   * Reads one complex A element (pa0[0..1]) and two complex B elements
   * (pb0[0..3]); neither pointer is advanced here.  Column 0 accumulates
   * into res0/res1 and column 1 into res2/res3:
   *     res2k   OP0= a_r * b_r;      res2k   OP1= a_i * b_i;
   *     res2k+1 OP2= OP4 a_r * b_i;  res2k+1 OP3= a_i * b_r;
   * OP0..OP3 are operator prefixes pasted onto '=' with ##; OP4 is a unary
   * sign that binds to a_r before the multiplication. */
  355. #define CGEMM_KERNEL_1X2(OP0, OP1, OP2, OP3, OP4) \
  356. { \
  357. /* 0th col */ \
  358. a0_r = pa0[0]; \
  359. a0_i = pa0[1]; \
  360. b0_r = pb0[0]; \
  361. b0_i = pb0[1]; \
  362. \
  363. res0 OP0## = a0_r * b0_r; \
  364. res0 OP1## = a0_i * b0_i; \
  365. res1 OP2## = OP4 a0_r * b0_i; \
  366. res1 OP3## = a0_i * b0_r; \
  367. \
  368. /* 1st col */ \
  369. b1_r = pb0[2]; \
  370. b1_i = pb0[3]; \
  371. res2 OP0## = a0_r * b1_r; \
  372. res2 OP1## = a0_i * b1_i; \
  373. res3 OP2## = OP4 a0_r * b1_i; \
  374. res3 OP3## = a0_i * b1_r; \
  375. }
  /* One k-step of the scalar 1x1 complex micro-kernel.
   * Reads one complex A element (pa0[0..1]) and one complex B element
   * (pb0[0..1]); neither pointer is advanced here.  Accumulates into
   * res0 (a_r*b_r and a_i*b_i terms) and res1 (a_r*b_i and a_i*b_r terms):
   *     res0 OP0= a_r * b_r;      res0 OP1= a_i * b_i;
   *     res1 OP2= OP4 a_r * b_i;  res1 OP3= a_i * b_r;
   * OP0..OP3 are operator prefixes pasted onto '=' with ##; OP4 is a unary
   * sign that binds to a_r before the multiplication. */
  376. #define CGEMM_KERNEL_1X1(OP0, OP1, OP2, OP3, OP4) \
  377. { \
  378. /* 0th col */ \
  379. a0_r = pa0[0]; \
  380. a0_i = pa0[1]; \
  381. b0_r = pb0[0]; \
  382. b0_i = pb0[1]; \
  383. \
  384. res0 OP0## = a0_r * b0_r; \
  385. res0 OP1## = a0_i * b0_i; \
  386. res1 OP2## = OP4 a0_r * b0_i; \
  387. res1 OP3## = a0_i * b0_r; \
  388. }
  /* Adds alpha times the accumulated 8x4 tile into C (vectorized).
   * For each of the four column pointers pc0..pc3: loads four vectors,
   * splits them with PCKEVOD_W2_SP into dstN_r / dstN_i (presumably real and
   * imaginary lanes), applies the complex scale
   *     dst_r += alpha_r * res_r - alpha_i * res_i;
   *     dst_i += alpha_r * res_i + alpha_i * res_r;
   * re-interleaves with ILVRL_W2_SP and stores through ST_SP4_INC
   * (presumably advancing pcN -- confirm in macros_msa.h).
   * pc0 consumes res0/res1, pc1 res2/res3, pc2 res4/res5, pc3 res6/res7.
   * alpha_r / alpha_i here are vector operands expected in scope at the
   * expansion site (distinct from the scalar alphar / alphai). */
  389. #define CGEMM_SCALE_8X4_MSA \
  390. { \
  391. LD_SP4(pc0, 4, dst0, dst1, dst2, dst3); \
  392. \
  393. PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
  394. PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \
  395. \
  396. dst0_r += alpha_r * res0_r; \
  397. dst0_r -= alpha_i * res0_i; \
  398. dst0_i += alpha_r * res0_i; \
  399. dst0_i += alpha_i * res0_r; \
  400. \
  401. dst1_r += alpha_r * res1_r; \
  402. dst1_r -= alpha_i * res1_i; \
  403. dst1_i += alpha_r * res1_i; \
  404. dst1_i += alpha_i * res1_r; \
  405. \
  406. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  407. ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
  408. \
  409. ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \
  410. \
  411. LD_SP4(pc1, 4, dst0, dst1, dst2, dst3); \
  412. \
  413. PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
  414. PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \
  415. \
  416. dst0_r += alpha_r * res2_r; \
  417. dst0_r -= alpha_i * res2_i; \
  418. dst0_i += alpha_r * res2_i; \
  419. dst0_i += alpha_i * res2_r; \
  420. \
  421. dst1_r += alpha_r * res3_r; \
  422. dst1_r -= alpha_i * res3_i; \
  423. dst1_i += alpha_r * res3_i; \
  424. dst1_i += alpha_i * res3_r; \
  425. \
  426. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  427. ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
  428. \
  429. ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4); \
  430. \
  431. LD_SP4(pc2, 4, dst0, dst1, dst2, dst3); \
  432. \
  433. PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
  434. PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \
  435. \
  436. dst0_r += alpha_r * res4_r; \
  437. dst0_r -= alpha_i * res4_i; \
  438. dst0_i += alpha_r * res4_i; \
  439. dst0_i += alpha_i * res4_r; \
  440. \
  441. dst1_r += alpha_r * res5_r; \
  442. dst1_r -= alpha_i * res5_i; \
  443. dst1_i += alpha_r * res5_i; \
  444. dst1_i += alpha_i * res5_r; \
  445. \
  446. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  447. ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
  448. \
  449. ST_SP4_INC(dst0, dst1, dst2, dst3, pc2, 4); \
  450. \
  451. LD_SP4(pc3, 4, dst0, dst1, dst2, dst3); \
  452. \
  453. PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
  454. PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \
  455. \
  456. dst0_r += alpha_r * res6_r; \
  457. dst0_r -= alpha_i * res6_i; \
  458. dst0_i += alpha_r * res6_i; \
  459. dst0_i += alpha_i * res6_r; \
  460. \
  461. dst1_r += alpha_r * res7_r; \
  462. dst1_r -= alpha_i * res7_i; \
  463. dst1_i += alpha_r * res7_i; \
  464. dst1_i += alpha_i * res7_r; \
  465. \
  466. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  467. ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
  468. \
  469. ST_SP4_INC(dst0, dst1, dst2, dst3, pc3, 4); \
  470. }
  /* Adds alpha times the accumulated 8x2 tile into C (vectorized).
   * For pc0 and then pc1: loads four vectors, splits them with
   * PCKEVOD_W2_SP into dstN_r / dstN_i, applies
   *     dst_r += alpha_r * res_r - alpha_i * res_i;
   *     dst_i += alpha_r * res_i + alpha_i * res_r;
   * re-interleaves with ILVRL_W2_SP and stores through ST_SP4_INC
   * (presumably advancing pcN -- confirm in macros_msa.h).
   * pc0 consumes res0/res1 and pc1 consumes res2/res3.
   * alpha_r / alpha_i are vector operands expected in scope at the
   * expansion site. */
  471. #define CGEMM_SCALE_8X2_MSA \
  472. { \
  473. LD_SP4(pc0, 4, dst0, dst1, dst2, dst3); \
  474. \
  475. PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
  476. PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \
  477. \
  478. dst0_r += alpha_r * res0_r; \
  479. dst0_r -= alpha_i * res0_i; \
  480. dst0_i += alpha_r * res0_i; \
  481. dst0_i += alpha_i * res0_r; \
  482. \
  483. dst1_r += alpha_r * res1_r; \
  484. dst1_r -= alpha_i * res1_i; \
  485. dst1_i += alpha_r * res1_i; \
  486. dst1_i += alpha_i * res1_r; \
  487. \
  488. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  489. ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
  490. \
  491. ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \
  492. \
  493. LD_SP4(pc1, 4, dst0, dst1, dst2, dst3); \
  494. \
  495. PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
  496. PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \
  497. \
  498. dst0_r += alpha_r * res2_r; \
  499. dst0_r -= alpha_i * res2_i; \
  500. dst0_i += alpha_r * res2_i; \
  501. dst0_i += alpha_i * res2_r; \
  502. \
  503. dst1_r += alpha_r * res3_r; \
  504. dst1_r -= alpha_i * res3_i; \
  505. dst1_i += alpha_r * res3_i; \
  506. dst1_i += alpha_i * res3_r; \
  507. \
  508. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  509. ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
  510. \
  511. ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4); \
  512. }
  /* Adds alpha times the accumulated 8x1 tile into C (vectorized).
   * Loads four vectors from pc0, splits them with PCKEVOD_W2_SP into
   * dstN_r / dstN_i, applies
   *     dst_r += alpha_r * res_r - alpha_i * res_i;
   *     dst_i += alpha_r * res_i + alpha_i * res_r;
   * using res0 and res1, re-interleaves with ILVRL_W2_SP and stores through
   * ST_SP4_INC (presumably advancing pc0 -- confirm in macros_msa.h).
   * alpha_r / alpha_i are vector operands expected in scope at the
   * expansion site. */
  513. #define CGEMM_SCALE_8X1_MSA \
  514. { \
  515. LD_SP4(pc0, 4, dst0, dst1, dst2, dst3); \
  516. \
  517. PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
  518. PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \
  519. \
  520. dst0_r += alpha_r * res0_r; \
  521. dst0_r -= alpha_i * res0_i; \
  522. dst0_i += alpha_r * res0_i; \
  523. dst0_i += alpha_i * res0_r; \
  524. \
  525. dst1_r += alpha_r * res1_r; \
  526. dst1_r -= alpha_i * res1_i; \
  527. dst1_i += alpha_r * res1_i; \
  528. dst1_i += alpha_i * res1_r; \
  529. \
  530. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  531. ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
  532. \
  533. ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \
  534. }
  /* Adds alpha times the accumulated 4x4 tile into C (vectorized).
   * For each of pc0..pc3: loads two vectors, splits them with PCKEVOD_W2_SP
   * into dst0_r / dst0_i, applies
   *     dst_r += alpha_r * res_r - alpha_i * res_i;
   *     dst_i += alpha_r * res_i + alpha_i * res_r;
   * re-interleaves with ILVRL_W2_SP and stores through ST_SP2_INC
   * (presumably advancing pcN -- confirm in macros_msa.h).
   * Only the even-numbered accumulators are consumed: pc0 uses res0,
   * pc1 res2, pc2 res4, pc3 res6, matching CGEMM_KERNEL_4X4_MSA.
   * alpha_r / alpha_i are vector operands expected in scope at the
   * expansion site. */
  535. #define CGEMM_SCALE_4X4_MSA \
  536. { \
  537. LD_SP2(pc0, 4, dst0, dst1); \
  538. \
  539. PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
  540. \
  541. dst0_r += alpha_r * res0_r; \
  542. dst0_r -= alpha_i * res0_i; \
  543. dst0_i += alpha_r * res0_i; \
  544. dst0_i += alpha_i * res0_r; \
  545. \
  546. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  547. \
  548. ST_SP2_INC(dst0, dst1, pc0, 4); \
  549. \
  550. LD_SP2(pc1, 4, dst0, dst1); \
  551. \
  552. PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
  553. \
  554. dst0_r += alpha_r * res2_r; \
  555. dst0_r -= alpha_i * res2_i; \
  556. dst0_i += alpha_r * res2_i; \
  557. dst0_i += alpha_i * res2_r; \
  558. \
  559. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  560. \
  561. ST_SP2_INC(dst0, dst1, pc1, 4); \
  562. \
  563. LD_SP2(pc2, 4, dst0, dst1); \
  564. \
  565. PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
  566. \
  567. dst0_r += alpha_r * res4_r; \
  568. dst0_r -= alpha_i * res4_i; \
  569. dst0_i += alpha_r * res4_i; \
  570. dst0_i += alpha_i * res4_r; \
  571. \
  572. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  573. \
  574. ST_SP2_INC(dst0, dst1, pc2, 4); \
  575. \
  576. LD_SP2(pc3, 4, dst0, dst1); \
  577. \
  578. PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
  579. \
  580. dst0_r += alpha_r * res6_r; \
  581. dst0_r -= alpha_i * res6_i; \
  582. dst0_i += alpha_r * res6_i; \
  583. dst0_i += alpha_i * res6_r; \
  584. \
  585. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  586. \
  587. ST_SP2_INC(dst0, dst1, pc3, 4); \
  588. }
  /* Adds alpha times the accumulated 4x2 tile into C (vectorized).
   * For pc0 and then pc1: loads two vectors, splits them with
   * PCKEVOD_W2_SP into dst0_r / dst0_i, applies
   *     dst_r += alpha_r * res_r - alpha_i * res_i;
   *     dst_i += alpha_r * res_i + alpha_i * res_r;
   * re-interleaves with ILVRL_W2_SP and stores through ST_SP2_INC
   * (presumably advancing pcN -- confirm in macros_msa.h).
   * pc0 consumes res0 and pc1 consumes res2.
   * alpha_r / alpha_i are vector operands expected in scope at the
   * expansion site. */
  589. #define CGEMM_SCALE_4X2_MSA \
  590. { \
  591. LD_SP2(pc0, 4, dst0, dst1); \
  592. \
  593. PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
  594. \
  595. dst0_r += alpha_r * res0_r; \
  596. dst0_r -= alpha_i * res0_i; \
  597. dst0_i += alpha_r * res0_i; \
  598. dst0_i += alpha_i * res0_r; \
  599. \
  600. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  601. \
  602. ST_SP2_INC(dst0, dst1, pc0, 4); \
  603. \
  604. LD_SP2(pc1, 4, dst0, dst1); \
  605. \
  606. PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
  607. \
  608. dst0_r += alpha_r * res2_r; \
  609. dst0_r -= alpha_i * res2_i; \
  610. dst0_i += alpha_r * res2_i; \
  611. dst0_i += alpha_i * res2_r; \
  612. \
  613. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  614. \
  615. ST_SP2_INC(dst0, dst1, pc1, 4); \
  616. }
  /* Adds alpha times the accumulated 4x1 tile into C (vectorized).
   * Loads two vectors from pc0, splits them with PCKEVOD_W2_SP into
   * dst0_r / dst0_i, applies
   *     dst_r += alpha_r * res0_r - alpha_i * res0_i;
   *     dst_i += alpha_r * res0_i + alpha_i * res0_r;
   * re-interleaves with ILVRL_W2_SP and stores through ST_SP2_INC
   * (presumably advancing pc0 -- confirm in macros_msa.h).
   * alpha_r / alpha_i are vector operands expected in scope at the
   * expansion site. */
  617. #define CGEMM_SCALE_4X1_MSA \
  618. { \
  619. LD_SP2(pc0, 4, dst0, dst1); \
  620. \
  621. PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
  622. \
  623. dst0_r += alpha_r * res0_r; \
  624. dst0_r -= alpha_i * res0_i; \
  625. dst0_i += alpha_r * res0_i; \
  626. dst0_i += alpha_i * res0_r; \
  627. \
  628. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  629. \
  630. ST_SP2_INC(dst0, dst1, pc0, 4); \
  631. }
  /* Adds alpha times the accumulated 2x4 tile into C (scalar).
   * For every accumulator pair (re, im) = (res2k, res2k+1) the update is
   *     c[even] += alphar * re - alphai * im;
   *     c[odd]  += alphar * im + alphai * re;
   * Each column pointer receives two complex elements (indices 0..3):
   * pc0 uses res0..res3, pc1 res4..res7, pc2 res8..res11, pc3 res12..res15.
   * The pcN pointers are only indexed, not advanced, by this macro. */
  632. #define CGEMM_SCALE_2X4 \
  633. { \
  634. /* 0th col */ \
  635. pc0[0] += alphar * res0; \
  636. pc0[0] -= alphai * res1; \
  637. pc0[1] += alphar * res1; \
  638. pc0[1] += alphai * res0; \
  639. pc0[2] += alphar * res2; \
  640. pc0[2] -= alphai * res3; \
  641. pc0[3] += alphar * res3; \
  642. pc0[3] += alphai * res2; \
  643. \
  644. /* 1st col */ \
  645. pc1[0] += alphar * res4; \
  646. pc1[0] -= alphai * res5; \
  647. pc1[1] += alphar * res5; \
  648. pc1[1] += alphai * res4; \
  649. pc1[2] += alphar * res6; \
  650. pc1[2] -= alphai * res7; \
  651. pc1[3] += alphar * res7; \
  652. pc1[3] += alphai * res6; \
  653. \
  654. /* 2nd col */ \
  655. pc2[0] += alphar * res8; \
  656. pc2[0] -= alphai * res9; \
  657. pc2[1] += alphar * res9; \
  658. pc2[1] += alphai * res8; \
  659. pc2[2] += alphar * res10; \
  660. pc2[2] -= alphai * res11; \
  661. pc2[3] += alphar * res11; \
  662. pc2[3] += alphai * res10; \
  663. \
  664. /* 3rd col */ \
  665. pc3[0] += alphar * res12; \
  666. pc3[0] -= alphai * res13; \
  667. pc3[1] += alphar * res13; \
  668. pc3[1] += alphai * res12; \
  669. pc3[2] += alphar * res14; \
  670. pc3[2] -= alphai * res15; \
  671. pc3[3] += alphar * res15; \
  672. pc3[3] += alphai * res14; \
  673. }
  /* Adds alpha times the accumulated 2x2 tile into C (scalar).
   * For every accumulator pair (re, im) = (res2k, res2k+1) the update is
   *     c[even] += alphar * re - alphai * im;
   *     c[odd]  += alphar * im + alphai * re;
   * pc0 receives res0..res3 and pc1 receives res4..res7 (two complex
   * elements each).  The pcN pointers are only indexed, not advanced. */
  674. #define CGEMM_SCALE_2X2 \
  675. { \
  676. /* 0th col */ \
  677. pc0[0] += alphar * res0; \
  678. pc0[0] -= alphai * res1; \
  679. pc0[1] += alphar * res1; \
  680. pc0[1] += alphai * res0; \
  681. pc0[2] += alphar * res2; \
  682. pc0[2] -= alphai * res3; \
  683. pc0[3] += alphar * res3; \
  684. pc0[3] += alphai * res2; \
  685. \
  686. /* 1st col */ \
  687. pc1[0] += alphar * res4; \
  688. pc1[0] -= alphai * res5; \
  689. pc1[1] += alphar * res5; \
  690. pc1[1] += alphai * res4; \
  691. pc1[2] += alphar * res6; \
  692. pc1[2] -= alphai * res7; \
  693. pc1[3] += alphar * res7; \
  694. pc1[3] += alphai * res6; \
  695. }
  /* Adds alpha times the accumulated 2x1 tile into C (scalar).
   * Updates two complex elements at pc0[0..3] from the accumulator pairs
   * (res0, res1) and (res2, res3):
   *     c[even] += alphar * re - alphai * im;
   *     c[odd]  += alphar * im + alphai * re;
   * pc0 is only indexed, not advanced, by this macro. */
  696. #define CGEMM_SCALE_2X1 \
  697. { \
  698. pc0[0] += alphar * res0; \
  699. pc0[0] -= alphai * res1; \
  700. pc0[1] += alphar * res1; \
  701. pc0[1] += alphai * res0; \
  702. \
  703. pc0[2] += alphar * res2; \
  704. pc0[2] -= alphai * res3; \
  705. pc0[3] += alphar * res3; \
  706. pc0[3] += alphai * res2; \
  707. }
  /* Adds alpha times the accumulated 1x4 tile into C (scalar).
   * Updates one complex element at index 0..1 of each of pc0..pc3 from the
   * accumulator pairs (res0,res1), (res2,res3), (res4,res5), (res6,res7):
   *     c[0] += alphar * re - alphai * im;
   *     c[1] += alphar * im + alphai * re;
   * The pcN pointers are only indexed, not advanced, by this macro. */
  708. #define CGEMM_SCALE_1X4 \
  709. { \
  710. pc0[0] += alphar * res0; \
  711. pc0[0] -= alphai * res1; \
  712. pc0[1] += alphar * res1; \
  713. pc0[1] += alphai * res0; \
  714. \
  715. pc1[0] += alphar * res2; \
  716. pc1[0] -= alphai * res3; \
  717. pc1[1] += alphar * res3; \
  718. pc1[1] += alphai * res2; \
  719. \
  720. pc2[0] += alphar * res4; \
  721. pc2[0] -= alphai * res5; \
  722. pc2[1] += alphar * res5; \
  723. pc2[1] += alphai * res4; \
  724. \
  725. pc3[0] += alphar * res6; \
  726. pc3[0] -= alphai * res7; \
  727. pc3[1] += alphar * res7; \
  728. pc3[1] += alphai * res6; \
  729. }
  /* Adds alpha times the accumulated 1x2 tile into C (scalar).
   * Updates one complex element at index 0..1 of pc0 and of pc1 from the
   * accumulator pairs (res0, res1) and (res2, res3):
   *     c[0] += alphar * re - alphai * im;
   *     c[1] += alphar * im + alphai * re;
   * The pcN pointers are only indexed, not advanced, by this macro. */
  730. #define CGEMM_SCALE_1X2 \
  731. { \
  732. pc0[0] += alphar * res0; \
  733. pc0[0] -= alphai * res1; \
  734. pc0[1] += alphar * res1; \
  735. pc0[1] += alphai * res0; \
  736. \
  737. pc1[0] += alphar * res2; \
  738. pc1[0] -= alphai * res3; \
  739. pc1[1] += alphar * res3; \
  740. pc1[1] += alphai * res2; \
  741. }
  /* Adds alpha times the accumulated 1x1 result into C (scalar).
   * With (re, im) = (res0, res1):
   *     pc0[0] += alphar * re - alphai * im;
   *     pc0[1] += alphar * im + alphai * re;
   * pc0 is only indexed, not advanced, by this macro. */
  742. #define CGEMM_SCALE_1X1 \
  743. { \
  744. pc0[0] += alphar * res0; \
  745. pc0[0] -= alphai * res1; \
  746. pc0[1] += alphar * res1; \
  747. pc0[1] += alphai * res0; \
  748. }
  /* Writes alpha times the accumulated 8x4 tile to C (vectorized).
   * Unlike CGEMM_SCALE_8X4_MSA, nothing is loaded from C: the first term of
   * each result is a plain assignment, so the previous contents of the tile
   * are overwritten.
   *     dst_r = alpha_r * res_r - alpha_i * res_i;
   *     dst_i = alpha_r * res_i + alpha_i * res_r;
   * Results are interleaved with ILVRL_W2_SP and stored through ST_SP4_INC
   * (presumably advancing pcN -- confirm in macros_msa.h).
   * pc0 consumes res0/res1, pc1 res2/res3, pc2 res4/res5, pc3 res6/res7.
   * alpha_r / alpha_i are vector operands expected in scope at the
   * expansion site. */
  749. #define CGEMM_TRMM_SCALE_8X4_MSA \
  750. { \
  751. dst0_r = alpha_r * res0_r; \
  752. dst0_r -= alpha_i * res0_i; \
  753. dst0_i = alpha_r * res0_i; \
  754. dst0_i += alpha_i * res0_r; \
  755. \
  756. dst1_r = alpha_r * res1_r; \
  757. dst1_r -= alpha_i * res1_i; \
  758. dst1_i = alpha_r * res1_i; \
  759. dst1_i += alpha_i * res1_r; \
  760. \
  761. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  762. ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
  763. \
  764. ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \
  765. \
  766. dst0_r = alpha_r * res2_r; \
  767. dst0_r -= alpha_i * res2_i; \
  768. dst0_i = alpha_r * res2_i; \
  769. dst0_i += alpha_i * res2_r; \
  770. \
  771. dst1_r = alpha_r * res3_r; \
  772. dst1_r -= alpha_i * res3_i; \
  773. dst1_i = alpha_r * res3_i; \
  774. dst1_i += alpha_i * res3_r; \
  775. \
  776. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  777. ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
  778. \
  779. ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4); \
  780. \
  781. dst0_r = alpha_r * res4_r; \
  782. dst0_r -= alpha_i * res4_i; \
  783. dst0_i = alpha_r * res4_i; \
  784. dst0_i += alpha_i * res4_r; \
  785. \
  786. dst1_r = alpha_r * res5_r; \
  787. dst1_r -= alpha_i * res5_i; \
  788. dst1_i = alpha_r * res5_i; \
  789. dst1_i += alpha_i * res5_r; \
  790. \
  791. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  792. ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
  793. \
  794. ST_SP4_INC(dst0, dst1, dst2, dst3, pc2, 4); \
  795. \
  796. dst0_r = alpha_r * res6_r; \
  797. dst0_r -= alpha_i * res6_i; \
  798. dst0_i = alpha_r * res6_i; \
  799. dst0_i += alpha_i * res6_r; \
  800. \
  801. dst1_r = alpha_r * res7_r; \
  802. dst1_r -= alpha_i * res7_i; \
  803. dst1_i = alpha_r * res7_i; \
  804. dst1_i += alpha_i * res7_r; \
  805. \
  806. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  807. ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
  808. \
  809. ST_SP4_INC(dst0, dst1, dst2, dst3, pc3, 4); \
  810. }
  /* Writes alpha times the accumulated 8x2 tile to C (vectorized).
   * Nothing is loaded from C; each result starts with a plain assignment,
   * so the tile is overwritten:
   *     dst_r = alpha_r * res_r - alpha_i * res_i;
   *     dst_i = alpha_r * res_i + alpha_i * res_r;
   * Results are interleaved with ILVRL_W2_SP and stored through ST_SP4_INC
   * (presumably advancing pcN -- confirm in macros_msa.h).
   * pc0 consumes res0/res1 and pc1 consumes res2/res3. */
  811. #define CGEMM_TRMM_SCALE_8X2_MSA \
  812. { \
  813. dst0_r = alpha_r * res0_r; \
  814. dst0_r -= alpha_i * res0_i; \
  815. dst0_i = alpha_r * res0_i; \
  816. dst0_i += alpha_i * res0_r; \
  817. \
  818. dst1_r = alpha_r * res1_r; \
  819. dst1_r -= alpha_i * res1_i; \
  820. dst1_i = alpha_r * res1_i; \
  821. dst1_i += alpha_i * res1_r; \
  822. \
  823. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  824. ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
  825. \
  826. ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \
  827. \
  828. dst0_r = alpha_r * res2_r; \
  829. dst0_r -= alpha_i * res2_i; \
  830. dst0_i = alpha_r * res2_i; \
  831. dst0_i += alpha_i * res2_r; \
  832. \
  833. dst1_r = alpha_r * res3_r; \
  834. dst1_r -= alpha_i * res3_i; \
  835. dst1_i = alpha_r * res3_i; \
  836. dst1_i += alpha_i * res3_r; \
  837. \
  838. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  839. ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
  840. \
  841. ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4); \
  842. }
  /* Writes alpha times the accumulated 8x1 tile to C (vectorized).
   * Nothing is loaded from C; each result starts with a plain assignment,
   * so the tile is overwritten:
   *     dst_r = alpha_r * res_r - alpha_i * res_i;
   *     dst_i = alpha_r * res_i + alpha_i * res_r;
   * using res0 and res1.  Results are interleaved with ILVRL_W2_SP and
   * stored through ST_SP4_INC to pc0 (presumably advancing pc0 -- confirm
   * in macros_msa.h). */
  843. #define CGEMM_TRMM_SCALE_8X1_MSA \
  844. { \
  845. dst0_r = alpha_r * res0_r; \
  846. dst0_r -= alpha_i * res0_i; \
  847. dst0_i = alpha_r * res0_i; \
  848. dst0_i += alpha_i * res0_r; \
  849. \
  850. dst1_r = alpha_r * res1_r; \
  851. dst1_r -= alpha_i * res1_i; \
  852. dst1_i = alpha_r * res1_i; \
  853. dst1_i += alpha_i * res1_r; \
  854. \
  855. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  856. ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
  857. \
  858. ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \
  859. }
  /* Writes alpha times the accumulated 4x4 tile to C (vectorized).
   * Nothing is loaded from C; each result starts with a plain assignment,
   * so the tile is overwritten:
   *     dst_r = alpha_r * res_r - alpha_i * res_i;
   *     dst_i = alpha_r * res_i + alpha_i * res_r;
   * Results are interleaved with ILVRL_W2_SP and stored through ST_SP2_INC
   * (presumably advancing pcN -- confirm in macros_msa.h).
   * Only the even-numbered accumulators are consumed: pc0 uses res0,
   * pc1 res2, pc2 res4, pc3 res6. */
  860. #define CGEMM_TRMM_SCALE_4X4_MSA \
  861. { \
  862. dst0_r = alpha_r * res0_r; \
  863. dst0_r -= alpha_i * res0_i; \
  864. dst0_i = alpha_r * res0_i; \
  865. dst0_i += alpha_i * res0_r; \
  866. \
  867. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  868. \
  869. ST_SP2_INC(dst0, dst1, pc0, 4); \
  870. \
  871. dst0_r = alpha_r * res2_r; \
  872. dst0_r -= alpha_i * res2_i; \
  873. dst0_i = alpha_r * res2_i; \
  874. dst0_i += alpha_i * res2_r; \
  875. \
  876. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  877. \
  878. ST_SP2_INC(dst0, dst1, pc1, 4); \
  879. \
  880. dst0_r = alpha_r * res4_r; \
  881. dst0_r -= alpha_i * res4_i; \
  882. dst0_i = alpha_r * res4_i; \
  883. dst0_i += alpha_i * res4_r; \
  884. \
  885. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  886. \
  887. ST_SP2_INC(dst0, dst1, pc2, 4); \
  888. \
  889. dst0_r = alpha_r * res6_r; \
  890. dst0_r -= alpha_i * res6_i; \
  891. dst0_i = alpha_r * res6_i; \
  892. dst0_i += alpha_i * res6_r; \
  893. \
  894. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  895. \
  896. ST_SP2_INC(dst0, dst1, pc3, 4); \
  897. }
  /* Writes alpha times the accumulated 4x2 tile to C (vectorized).
   * Nothing is loaded from C; each result starts with a plain assignment,
   * so the tile is overwritten:
   *     dst_r = alpha_r * res_r - alpha_i * res_i;
   *     dst_i = alpha_r * res_i + alpha_i * res_r;
   * Results are interleaved with ILVRL_W2_SP and stored through ST_SP2_INC
   * (presumably advancing pcN -- confirm in macros_msa.h).
   * pc0 consumes res0 and pc1 consumes res2. */
  898. #define CGEMM_TRMM_SCALE_4X2_MSA \
  899. { \
  900. dst0_r = alpha_r * res0_r; \
  901. dst0_r -= alpha_i * res0_i; \
  902. dst0_i = alpha_r * res0_i; \
  903. dst0_i += alpha_i * res0_r; \
  904. \
  905. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  906. \
  907. ST_SP2_INC(dst0, dst1, pc0, 4); \
  908. \
  909. dst0_r = alpha_r * res2_r; \
  910. dst0_r -= alpha_i * res2_i; \
  911. dst0_i = alpha_r * res2_i; \
  912. dst0_i += alpha_i * res2_r; \
  913. \
  914. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  915. \
  916. ST_SP2_INC(dst0, dst1, pc1, 4); \
  917. }
  /* Writes alpha times the accumulated 4x1 tile to C (vectorized).
   * Nothing is loaded from C; each result starts with a plain assignment,
   * so the tile is overwritten:
   *     dst_r = alpha_r * res0_r - alpha_i * res0_i;
   *     dst_i = alpha_r * res0_i + alpha_i * res0_r;
   * The result is interleaved with ILVRL_W2_SP and stored through
   * ST_SP2_INC to pc0 (presumably advancing pc0 -- confirm in
   * macros_msa.h). */
  918. #define CGEMM_TRMM_SCALE_4X1_MSA \
  919. { \
  920. dst0_r = alpha_r * res0_r; \
  921. dst0_r -= alpha_i * res0_i; \
  922. dst0_i = alpha_r * res0_i; \
  923. dst0_i += alpha_i * res0_r; \
  924. \
  925. ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
  926. \
  927. ST_SP2_INC(dst0, dst1, pc0, 4); \
  928. }
  /* Writes alpha times the accumulated 2x4 tile to C (scalar).
   * Each output starts with a plain assignment, so the previous contents of
   * C are overwritten rather than accumulated.  For every accumulator pair
   * (re, im) = (res2k, res2k+1):
   *     c[even] = alphar * re - alphai * im;
   *     c[odd]  = alphar * im + alphai * re;
   * pc0 uses res0..res3, pc1 res4..res7, pc2 res8..res11, pc3 res12..res15.
   * The pcN pointers are only indexed, not advanced, by this macro. */
  929. #define CGEMM_TRMM_SCALE_2X4 \
  930. { \
  931. /* 0th col */ \
  932. pc0[0] = alphar * res0; \
  933. pc0[0] -= alphai * res1; \
  934. pc0[1] = alphar * res1; \
  935. pc0[1] += alphai * res0; \
  936. pc0[2] = alphar * res2; \
  937. pc0[2] -= alphai * res3; \
  938. pc0[3] = alphar * res3; \
  939. pc0[3] += alphai * res2; \
  940. \
  941. /* 1st col */ \
  942. pc1[0] = alphar * res4; \
  943. pc1[0] -= alphai * res5; \
  944. pc1[1] = alphar * res5; \
  945. pc1[1] += alphai * res4; \
  946. pc1[2] = alphar * res6; \
  947. pc1[2] -= alphai * res7; \
  948. pc1[3] = alphar * res7; \
  949. pc1[3] += alphai * res6; \
  950. \
  951. /* 2nd col */ \
  952. pc2[0] = alphar * res8; \
  953. pc2[0] -= alphai * res9; \
  954. pc2[1] = alphar * res9; \
  955. pc2[1] += alphai * res8; \
  956. pc2[2] = alphar * res10; \
  957. pc2[2] -= alphai * res11; \
  958. pc2[3] = alphar * res11; \
  959. pc2[3] += alphai * res10; \
  960. \
  961. /* 3rd col */ \
  962. pc3[0] = alphar * res12; \
  963. pc3[0] -= alphai * res13; \
  964. pc3[1] = alphar * res13; \
  965. pc3[1] += alphai * res12; \
  966. pc3[2] = alphar * res14; \
  967. pc3[2] -= alphai * res15; \
  968. pc3[3] = alphar * res15; \
  969. pc3[3] += alphai * res14; \
  970. }
  /* Writes alpha times the accumulated 2x2 tile to C (scalar).
   * Each output starts with a plain assignment, so C is overwritten.
   * For every accumulator pair (re, im) = (res2k, res2k+1):
   *     c[even] = alphar * re - alphai * im;
   *     c[odd]  = alphar * im + alphai * re;
   * pc0 uses res0..res3 and pc1 uses res4..res7.
   * The pcN pointers are only indexed, not advanced, by this macro. */
  971. #define CGEMM_TRMM_SCALE_2X2 \
  972. { \
  973. /* 0th col */ \
  974. pc0[0] = alphar * res0; \
  975. pc0[0] -= alphai * res1; \
  976. pc0[1] = alphar * res1; \
  977. pc0[1] += alphai * res0; \
  978. pc0[2] = alphar * res2; \
  979. pc0[2] -= alphai * res3; \
  980. pc0[3] = alphar * res3; \
  981. pc0[3] += alphai * res2; \
  982. \
  983. /* 1st col */ \
  984. pc1[0] = alphar * res4; \
  985. pc1[0] -= alphai * res5; \
  986. pc1[1] = alphar * res5; \
  987. pc1[1] += alphai * res4; \
  988. pc1[2] = alphar * res6; \
  989. pc1[2] -= alphai * res7; \
  990. pc1[3] = alphar * res7; \
  991. pc1[3] += alphai * res6; \
  992. }
  /* Writes alpha times the accumulated 2x1 tile to C (scalar).
   * Each output starts with a plain assignment, so C is overwritten.
   * Two complex elements at pc0[0..3] are produced from the accumulator
   * pairs (res0, res1) and (res2, res3):
   *     c[even] = alphar * re - alphai * im;
   *     c[odd]  = alphar * im + alphai * re;
   * pc0 is only indexed, not advanced, by this macro. */
  993. #define CGEMM_TRMM_SCALE_2X1 \
  994. { \
  995. pc0[0] = alphar * res0; \
  996. pc0[0] -= alphai * res1; \
  997. pc0[1] = alphar * res1; \
  998. pc0[1] += alphai * res0; \
  999. \
  1000. pc0[2] = alphar * res2; \
  1001. pc0[2] -= alphai * res3; \
  1002. pc0[3] = alphar * res3; \
  1003. pc0[3] += alphai * res2; \
  1004. }
  #ifndef CGEMM_TRMM_STORE_ALPHA_MUL
  /* Write alpha * (re + i*im) to pc[at] (real part) and pc[at + 1] (imaginary
     part). Relies on alphar / alphai being in scope where it is expanded. */
  #define CGEMM_TRMM_STORE_ALPHA_MUL(pc, at, re, im) \
      pc[at] = alphar * re;                          \
      pc[at] -= alphai * im;                         \
      pc[at + 1] = alphar * im;                      \
      pc[at + 1] += alphai * re;
  #endif
  /*
   * TRMM store for a 1-row x 4-column complex tile.
   *
   * Overwrites one complex value in each of the columns pc0..pc3 with
   * alpha * acc, using the (real, imaginary) accumulator pairs
   * (res0,res1) .. (res6,res7).  Pointers are left untouched; the expansion
   * is a braced compound statement used without a trailing semicolon.
   */
  #define CGEMM_TRMM_SCALE_1X4                           \
  {                                                      \
      CGEMM_TRMM_STORE_ALPHA_MUL(pc0, 0, res0, res1)     \
      CGEMM_TRMM_STORE_ALPHA_MUL(pc1, 0, res2, res3)     \
      CGEMM_TRMM_STORE_ALPHA_MUL(pc2, 0, res4, res5)     \
      CGEMM_TRMM_STORE_ALPHA_MUL(pc3, 0, res6, res7)     \
  }
  #ifndef CGEMM_TRMM_STORE_ALPHA_MUL
  /* Write alpha * (re + i*im) to pc[at] (real part) and pc[at + 1] (imaginary
     part). Relies on alphar / alphai being in scope where it is expanded. */
  #define CGEMM_TRMM_STORE_ALPHA_MUL(pc, at, re, im) \
      pc[at] = alphar * re;                          \
      pc[at] -= alphai * im;                         \
      pc[at + 1] = alphar * im;                      \
      pc[at + 1] += alphai * re;
  #endif
  /*
   * TRMM store for a 1-row x 2-column complex tile.
   *
   * Overwrites one complex value in each of the columns pc0 and pc1 with
   * alpha * acc, using the (real, imaginary) accumulator pairs (res0,res1)
   * and (res2,res3).  Pointers are left untouched; the expansion is a braced
   * compound statement used without a trailing semicolon.
   */
  #define CGEMM_TRMM_SCALE_1X2                           \
  {                                                      \
      CGEMM_TRMM_STORE_ALPHA_MUL(pc0, 0, res0, res1)     \
      CGEMM_TRMM_STORE_ALPHA_MUL(pc1, 0, res2, res3)     \
  }
  #ifndef CGEMM_TRMM_STORE_ALPHA_MUL
  /* Write alpha * (re + i*im) to pc[at] (real part) and pc[at + 1] (imaginary
     part). Relies on alphar / alphai being in scope where it is expanded. */
  #define CGEMM_TRMM_STORE_ALPHA_MUL(pc, at, re, im) \
      pc[at] = alphar * re;                          \
      pc[at] -= alphai * im;                         \
      pc[at + 1] = alphar * im;                      \
      pc[at + 1] += alphai * re;
  #endif
  /*
   * TRMM store for a single complex element.
   *
   * Overwrites pc0[0] (real) and pc0[1] (imaginary) with alpha * acc, where
   * acc is (res0, res1).  pc0 is not advanced; the expansion is a braced
   * compound statement used without a trailing semicolon.
   */
  #define CGEMM_TRMM_SCALE_1X1                           \
  {                                                      \
      CGEMM_TRMM_STORE_ALPHA_MUL(pc0, 0, res0, res1)     \
  }
  /*
   * CNAME: complex GEMM micro-kernel; when TRMMKERNEL is defined the same body
   * is built as the TRMM kernel and takes an extra `offset` argument.
   *
   * Parameters
   *   m, n, k        tile-loop extents: C is walked as m rows by n columns and
   *                  each tile runs `temp` kernel steps (temp == k unless
   *                  TRMMKERNEL is defined).
   *   alphar, alphai real and imaginary part of the scalar applied when a
   *                  tile is written to C.
   *   A, B           input panels read through pa0 / pb0.
   *                  NOTE(review): presumably packed panels produced by the
   *                  copy routines -- confirm against the callers.
   *   C, ldc         output; consecutive columns are 2 * ldc FLOATs apart
   *                  (real and imaginary parts are stored as adjacent FLOATs).
   *   offset         (TRMMKERNEL only) seeds `off`, which selects the starting
   *                  position and the step count for each tile.
   *
   * Structure
   *   Columns are processed in panels of 4 (n >> 2), then 2 (n & 2), then
   *   1 (n & 1).  Inside each panel rows are processed in tiles of 8 (m >> 3),
   *   4 (m & 4), 2 (m & 2) and 1 (m & 1).  Every tile follows the same steps:
   *     1. set pb0 / temp (with TRMM-specific adjustments of pa0 / pb0),
   *     2. one kernel-macro call with an empty first operator argument,
   *     3. temp - 1 further kernel-macro calls with '+' as first argument,
   *     4. a SCALE macro (TRMM or plain variant) that writes the tile to C,
   *     5. TRMM-specific pointer / `off` bookkeeping.
   *   NOTE(review): the kernel macros are defined outside this function; the
   *   first call presumably initialises the accumulators and the later calls
   *   accumulate into them -- confirm against the macro definitions.
   *
   *   The sign arguments handed to the kernel macros are selected by the
   *   NN/NT/TN/TT, NR/NC/TR/TC, RN/RT/CN/CT and RR/RC/CR/CC build macros.
   *
   *   NOTE(review): the loops `for (l = (temp - 1); l--;)` assume temp >= 1;
   *   with temp == 0 the counter would start at -1 and the loop would not
   *   stop at zero.
   *
   * Returns 0 unconditionally.
   */
  1046. int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai,
  1047. FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc
  1048. #ifdef TRMMKERNEL
  1049. , BLASLONG offset
  1050. #endif
  1051. )
  1052. {
  1053. BLASLONG i, j, l, temp;
  1054. #if defined(TRMMKERNEL)
  1055. BLASLONG off;
  1056. #endif
  1057. FLOAT *pc0, *pc1, *pc2, *pc3, *pa0, *pb0;
  1058. FLOAT res0, res1, res2, res3, res4, res5, res6, res7;
  1059. FLOAT res8, res9, res10, res11, res12, res13, res14, res15;
  1060. FLOAT a0_r, a1_r, a0_i, a1_i, b0_i, b1_i, b2_i, b3_i;
  1061. FLOAT b0_r, b1_r, b2_r, b3_r;
  1062. v4f32 src_a0, src_a1, src_a2, src_a3, src_b0, src_b1;
  1063. v4f32 src_a0r, src_a0i, src_a1r, src_a1i, src_br, src_bi;
  1064. v4f32 dst0, dst1, dst2, dst3, alpha_r, alpha_i;
  1065. v4f32 res0_r, res0_i, res1_r, res1_i, res2_r, res2_i, res3_r, res3_i;
  1066. v4f32 res4_r, res4_i, res5_r, res5_i, res6_r, res6_i, res7_r, res7_i;
  1067. v4f32 dst0_r, dst0_i, dst1_r, dst1_i;
  /* Vector copies of alpha.  They are not referenced directly in this
     function body; presumably the *_MSA scale macros consume them. */
  1068. alpha_r = COPY_FLOAT_TO_VECTOR(alphar);
  1069. alpha_i = COPY_FLOAT_TO_VECTOR(alphai);
  1070. #if defined(TRMMKERNEL) && !defined(LEFT)
  1071. off = -offset;
  1072. #endif
  /* ---- column panels of 4 -------------------------------------------- */
  1073. for (j = (n >> 2); j--;)
  1074. {
  /* One output pointer per column of the panel; columns are 2 * ldc FLOATs
     apart. */
  1075. pc0 = C;
  1076. pc1 = pc0 + 2 * ldc;
  1077. pc2 = pc1 + 2 * ldc;
  1078. pc3 = pc2 + 2 * ldc;
  1079. #if defined(TRMMKERNEL) && defined(LEFT)
  1080. off = offset;
  1081. #endif
  1082. pa0 = A;
  /* 8-row tiles.  No explicit pa0 / pb0 / pc increments appear around the
     *_MSA macros in this tile; presumably the macros advance the pointers
     themselves -- confirm against their definitions. */
  1083. for (i = (m >> 3); i--;)
  1084. {
  /* TRMM: choose where in A / B this tile starts and how many steps (temp)
     it runs, based on `off`; plain GEMM always starts at B and runs k steps. */
  1085. #if defined(TRMMKERNEL)
  1086. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1087. pb0 = B;
  1088. #else
  1089. pa0 += off * 2 * 8;
  1090. pb0 = B + off * 2 * 4;
  1091. #endif
  1092. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1093. temp = k - off;
  1094. #elif defined(LEFT)
  1095. temp = off + 8; // number of values in A
  1096. #else
  1097. temp = off + 4; // number of values in B
  1098. #endif
  1099. #else
  1100. pb0 = B;
  1101. temp = k;
  1102. #endif
  1103. #ifdef ENABLE_PREFETCH
  1104. __asm__ __volatile__(
  1105. "pref 0, 64(%[pa0]) \n\t"
  1106. "pref 0, 96(%[pa0]) \n\t"
  1107. "pref 0, 32(%[pb0]) \n\t"
  1108. :
  1109. : [pa0] "r" (pa0), [pb0] "r" (pb0)
  1110. );
  1111. #endif
  /* First step: kernel macro with an empty first operator argument. */
  1112. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1113. CGEMM_KERNEL_8X4_MSA(, -, , +, +);
  1114. #endif
  1115. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1116. CGEMM_KERNEL_8X4_MSA(, +, , +, -);
  1117. #endif
  1118. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1119. CGEMM_KERNEL_8X4_MSA(, +, , -, +);
  1120. #endif
  1121. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1122. CGEMM_KERNEL_8X4_MSA(, -, , -, -);
  1123. #endif
  /* Remaining temp - 1 steps: kernel macro with '+' as first argument. */
  1124. for (l = (temp - 1); l--;)
  1125. {
  1126. #ifdef ENABLE_PREFETCH
  1127. __asm__ __volatile__(
  1128. "pref 0, 64(%[pa0]) \n\t"
  1129. "pref 0, 96(%[pa0]) \n\t"
  1130. "pref 0, 32(%[pb0]) \n\t"
  1131. :
  1132. : [pa0] "r" (pa0), [pb0] "r" (pb0)
  1133. );
  1134. #endif
  1135. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1136. CGEMM_KERNEL_8X4_MSA(+, -, +, +,);
  1137. #endif
  1138. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1139. CGEMM_KERNEL_8X4_MSA(+, +, -, +,);
  1140. #endif
  1141. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1142. CGEMM_KERNEL_8X4_MSA(+, +, +, -,);
  1143. #endif
  1144. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1145. CGEMM_KERNEL_8X4_MSA(+, -, -, -,);
  1146. #endif
  1147. }
  /* Apply alpha and write the tile to C. */
  1148. #if defined(TRMMKERNEL)
  1149. CGEMM_TRMM_SCALE_8X4_MSA
  1150. #else
  1151. CGEMM_SCALE_8X4_MSA
  1152. #endif
  /* TRMM: skip the part of the A / B panels this tile did not consume and,
     for LEFT, advance `off` by the tile height. */
  1153. #if defined(TRMMKERNEL)
  1154. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1155. temp = k - off;
  1156. #ifdef LEFT
  1157. temp -= 8; // number of values in A
  1158. #else
  1159. temp -= 4; // number of values in B
  1160. #endif
  1161. pa0 += temp * 2 * 8;
  1162. pb0 += temp * 2 * 4;
  1163. #endif
  1164. #ifdef LEFT
  1165. off += 8; // number of values in A
  1166. #endif
  1167. #endif
  1168. }
  /* 4-row tile of the 4-column panel (same steps as the 8-row tile). */
  1169. if (m & 4)
  1170. {
  1171. #if defined(TRMMKERNEL)
  1172. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1173. pb0 = B;
  1174. #else
  1175. pa0 += off * 2 * 4;
  1176. pb0 = B + off * 2 * 4;
  1177. #endif
  1178. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1179. temp = k - off;
  1180. #elif defined(LEFT)
  1181. temp = off + 4; // number of values in A
  1182. #else
  1183. temp = off + 4; // number of values in B
  1184. #endif
  1185. #else
  1186. pb0 = B;
  1187. temp = k;
  1188. #endif
  1189. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1190. CGEMM_KERNEL_4X4_MSA(, -, , +, +);
  1191. #endif
  1192. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1193. CGEMM_KERNEL_4X4_MSA(, +, , +, -);
  1194. #endif
  1195. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1196. CGEMM_KERNEL_4X4_MSA(, +, , -, +);
  1197. #endif
  1198. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1199. CGEMM_KERNEL_4X4_MSA(, -, , -, -);
  1200. #endif
  1201. for (l = (temp - 1); l--;)
  1202. {
  1203. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1204. CGEMM_KERNEL_4X4_MSA(+, -, +, +,);
  1205. #endif
  1206. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1207. CGEMM_KERNEL_4X4_MSA(+, +, -, +,);
  1208. #endif
  1209. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1210. CGEMM_KERNEL_4X4_MSA(+, +, +, -,);
  1211. #endif
  1212. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1213. CGEMM_KERNEL_4X4_MSA(+, -, -, -,);
  1214. #endif
  1215. }
  1216. #if defined(TRMMKERNEL)
  1217. CGEMM_TRMM_SCALE_4X4_MSA
  1218. #else
  1219. CGEMM_SCALE_4X4_MSA
  1220. #endif
  1221. #if defined(TRMMKERNEL)
  1222. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1223. temp = k - off;
  1224. #ifdef LEFT
  1225. temp -= 4; // number of values in A
  1226. #else
  1227. temp -= 4; // number of values in B
  1228. #endif
  1229. pa0 += temp * 2 * 4;
  1230. pb0 += temp * 2 * 4;
  1231. #endif
  1232. #ifdef LEFT
  1233. off += 4; // number of values in A
  1234. #endif
  1235. #endif
  1236. }
  /* 2-row tile of the 4-column panel.  The non-_MSA macros are used here and
     pa0 / pb0 / pc0..pc3 are advanced explicitly after each macro. */
  1237. if (m & 2)
  1238. {
  1239. #if defined(TRMMKERNEL)
  1240. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1241. pb0 = B;
  1242. #else
  1243. pa0 += off * 2 * 2;
  1244. pb0 = B + off * 2 * 4;
  1245. #endif
  1246. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1247. temp = k - off;
  1248. #elif defined(LEFT)
  1249. temp = off + 2; // number of values in A
  1250. #else
  1251. temp = off + 4; // number of values in B
  1252. #endif
  1253. #else
  1254. pb0 = B;
  1255. temp = k;
  1256. #endif
  1257. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1258. CGEMM_KERNEL_2X4(, -, , +, +);
  1259. #endif
  1260. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1261. CGEMM_KERNEL_2X4(, +, , +, -);
  1262. #endif
  1263. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1264. CGEMM_KERNEL_2X4(, +, , -, +);
  1265. #endif
  1266. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1267. CGEMM_KERNEL_2X4(, -, , -, -);
  1268. #endif
  /* Per step: 2 complex values of A (4 FLOATs), 4 complex values of B
     (8 FLOATs). */
  1269. pa0 += 4;
  1270. pb0 += 8;
  1271. for (l = (temp - 1); l--;)
  1272. {
  1273. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1274. CGEMM_KERNEL_2X4(+, -, +, +,);
  1275. #endif
  1276. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1277. CGEMM_KERNEL_2X4(+, +, -, +,);
  1278. #endif
  1279. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1280. CGEMM_KERNEL_2X4(+, +, +, -,);
  1281. #endif
  1282. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1283. CGEMM_KERNEL_2X4(+, -, -, -,);
  1284. #endif
  1285. pa0 += 4;
  1286. pb0 += 8;
  1287. }
  1288. #if defined(TRMMKERNEL)
  1289. CGEMM_TRMM_SCALE_2X4
  1290. #else
  1291. CGEMM_SCALE_2X4
  1292. #endif
  /* Step past the 2 complex rows just written in every column. */
  1293. pc0 += 4;
  1294. pc1 += 4;
  1295. pc2 += 4;
  1296. pc3 += 4;
  1297. #if defined(TRMMKERNEL)
  1298. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1299. temp = k - off;
  1300. #ifdef LEFT
  1301. temp -= 2; // number of values in A
  1302. #else
  1303. temp -= 4; // number of values in B
  1304. #endif
  1305. pa0 += temp * 2 * 2;
  1306. pb0 += temp * 2 * 4;
  1307. #endif
  1308. #ifdef LEFT
  1309. off += 2; // number of values in A
  1310. #endif
  1311. #endif
  1312. }
  /* 1-row tile of the 4-column panel. */
  1313. if (m & 1)
  1314. {
  1315. #if defined(TRMMKERNEL)
  1316. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1317. pb0 = B;
  1318. #else
  1319. pa0 += off * 2 * 1;
  1320. pb0 = B + off * 2 * 4;
  1321. #endif
  1322. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1323. temp = k - off;
  1324. #elif defined(LEFT)
  1325. temp = off + 1; // number of values in A
  1326. #else
  1327. temp = off + 4; // number of values in B
  1328. #endif
  1329. #else
  1330. pb0 = B;
  1331. temp = k;
  1332. #endif
  1333. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1334. CGEMM_KERNEL_1X4(, -, , +, +);
  1335. #endif
  1336. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1337. CGEMM_KERNEL_1X4(, +, , +, -);
  1338. #endif
  1339. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1340. CGEMM_KERNEL_1X4(, +, , -, +);
  1341. #endif
  1342. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1343. CGEMM_KERNEL_1X4(, -, , -, -);
  1344. #endif
  1345. pa0 += 2;
  1346. pb0 += 8;
  1347. for (l = (temp - 1); l--;)
  1348. {
  1349. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1350. CGEMM_KERNEL_1X4(+, -, +, +,);
  1351. #endif
  1352. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1353. CGEMM_KERNEL_1X4(+, +, -, +,);
  1354. #endif
  1355. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1356. CGEMM_KERNEL_1X4(+, +, +, -,);
  1357. #endif
  1358. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1359. CGEMM_KERNEL_1X4(+, -, -, -,);
  1360. #endif
  1361. pa0 += 2;
  1362. pb0 += 8;
  1363. }
  1364. #if defined(TRMMKERNEL)
  1365. CGEMM_TRMM_SCALE_1X4
  1366. #else
  1367. CGEMM_SCALE_1X4
  1368. #endif
  1369. pc0 += 2;
  1370. pc1 += 2;
  1371. pc2 += 2;
  1372. pc3 += 2;
  1373. #if defined(TRMMKERNEL)
  1374. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1375. temp = k - off;
  1376. #ifdef LEFT
  1377. temp -= 1; // number of values in A
  1378. #else
  1379. temp -= 4; // number of values in B
  1380. #endif
  1381. pa0 += temp * 2 * 1;
  1382. pb0 += temp * 2 * 4;
  1383. #endif
  1384. #ifdef LEFT
  1385. off += 1; // number of values in A
  1386. #endif
  1387. #endif
  1388. }
  /* Panel done: for the non-LEFT TRMM case advance `off` by the panel width,
     then move B and C past 4 columns (4 columns * 2 FLOATs = 8 per k / ldc). */
  1389. #if defined(TRMMKERNEL) && !defined(LEFT)
  1390. off += 4; // number of values in B
  1391. #endif
  1392. B += (k << 3);
  1393. C += (ldc << 3);
  1394. }
  /* ---- column panel of 2 --------------------------------------------- */
  1395. if (n & 2)
  1396. {
  1397. pc0 = C;
  1398. pc1 = pc0 + 2 * ldc;
  1399. #if defined(TRMMKERNEL) && defined(LEFT)
  1400. off = offset;
  1401. #endif
  1402. pa0 = A;
  /* 8-row tiles.  Unlike the 8x4 case, pb0 is advanced explicitly here
     (2 complex values of B = 4 FLOATs per step). */
  1403. for (i = (m >> 3); i--;)
  1404. {
  1405. #if defined(TRMMKERNEL)
  1406. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1407. pb0 = B;
  1408. #else
  1409. pa0 += off * 2 * 8;
  1410. pb0 = B + off * 2 * 2;
  1411. #endif
  1412. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1413. temp = k - off;
  1414. #elif defined(LEFT)
  1415. temp = off + 8; // number of values in A
  1416. #else
  1417. temp = off + 2; // number of values in B
  1418. #endif
  1419. #else
  1420. pb0 = B;
  1421. temp = k;
  1422. #endif
  1423. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1424. CGEMM_KERNEL_8X2_MSA(, -, , +, +);
  1425. #endif
  1426. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1427. CGEMM_KERNEL_8X2_MSA(, +, , +, -);
  1428. #endif
  1429. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1430. CGEMM_KERNEL_8X2_MSA(, +, , -, +);
  1431. #endif
  1432. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1433. CGEMM_KERNEL_8X2_MSA(, -, , -, -);
  1434. #endif
  1435. pb0 += 4;
  1436. for (l = (temp - 1); l--;)
  1437. {
  1438. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1439. CGEMM_KERNEL_8X2_MSA(+, -, +, +,);
  1440. #endif
  1441. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1442. CGEMM_KERNEL_8X2_MSA(+, +, -, +,);
  1443. #endif
  1444. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1445. CGEMM_KERNEL_8X2_MSA(+, +, +, -,);
  1446. #endif
  1447. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1448. CGEMM_KERNEL_8X2_MSA(+, -, -, -,);
  1449. #endif
  1450. pb0 += 4;
  1451. }
  1452. #if defined(TRMMKERNEL)
  1453. CGEMM_TRMM_SCALE_8X2_MSA
  1454. #else
  1455. CGEMM_SCALE_8X2_MSA
  1456. #endif
  1457. #if defined(TRMMKERNEL)
  1458. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1459. temp = k - off;
  1460. #ifdef LEFT
  1461. temp -= 8; // number of values in A
  1462. #else
  1463. temp -= 2; // number of values in B
  1464. #endif
  1465. pa0 += temp * 2 * 8;
  1466. pb0 += temp * 2 * 2;
  1467. #endif
  1468. #ifdef LEFT
  1469. off += 8; // number of values in A
  1470. #endif
  1471. #endif
  1472. }
  /* 4-row tile of the 2-column panel. */
  1473. if (m & 4)
  1474. {
  1475. #if defined(TRMMKERNEL)
  1476. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1477. pb0 = B;
  1478. #else
  1479. pa0 += off * 2 * 4;
  1480. pb0 = B + off * 2 * 2;
  1481. #endif
  1482. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1483. temp = k - off;
  1484. #elif defined(LEFT)
  1485. temp = off + 4; // number of values in A
  1486. #else
  1487. temp = off + 2; // number of values in B
  1488. #endif
  1489. #else
  1490. pb0 = B;
  1491. temp = k;
  1492. #endif
  1493. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1494. CGEMM_KERNEL_4X2_MSA(, -, , +, +);
  1495. #endif
  1496. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1497. CGEMM_KERNEL_4X2_MSA(, +, , +, -);
  1498. #endif
  1499. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1500. CGEMM_KERNEL_4X2_MSA(, +, , -, +);
  1501. #endif
  1502. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1503. CGEMM_KERNEL_4X2_MSA(, -, , -, -);
  1504. #endif
  1505. pb0 += 4;
  1506. for (l = (temp - 1); l--;)
  1507. {
  1508. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1509. CGEMM_KERNEL_4X2_MSA(+, -, +, +,);
  1510. #endif
  1511. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1512. CGEMM_KERNEL_4X2_MSA(+, +, -, +,);
  1513. #endif
  1514. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1515. CGEMM_KERNEL_4X2_MSA(+, +, +, -,);
  1516. #endif
  1517. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1518. CGEMM_KERNEL_4X2_MSA(+, -, -, -,);
  1519. #endif
  1520. pb0 += 4;
  1521. }
  1522. #if defined(TRMMKERNEL)
  1523. CGEMM_TRMM_SCALE_4X2_MSA
  1524. #else
  1525. CGEMM_SCALE_4X2_MSA
  1526. #endif
  1527. #if defined(TRMMKERNEL)
  1528. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1529. temp = k - off;
  1530. #ifdef LEFT
  1531. temp -= 4; // number of values in A
  1532. #else
  1533. temp -= 2; // number of values in B
  1534. #endif
  1535. pa0 += temp * 2 * 4;
  1536. pb0 += temp * 2 * 2;
  1537. #endif
  1538. #ifdef LEFT
  1539. off += 4; // number of values in A
  1540. #endif
  1541. #endif
  1542. }
  /* 2-row tile of the 2-column panel. */
  1543. if (m & 2)
  1544. {
  1545. #if defined(TRMMKERNEL)
  1546. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1547. pb0 = B;
  1548. #else
  1549. pa0 += off * 2 * 2;
  1550. pb0 = B + off * 2 * 2;
  1551. #endif
  1552. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1553. temp = k - off;
  1554. #elif defined(LEFT)
  1555. temp = off + 2; // number of values in A
  1556. #else
  1557. temp = off + 2; // number of values in B
  1558. #endif
  1559. #else
  1560. pb0 = B;
  1561. temp = k;
  1562. #endif
  1563. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1564. CGEMM_KERNEL_2X2(, -, , +, +);
  1565. #endif
  1566. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1567. CGEMM_KERNEL_2X2(, +, , +, -);
  1568. #endif
  1569. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1570. CGEMM_KERNEL_2X2(, +, , -, +);
  1571. #endif
  1572. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1573. CGEMM_KERNEL_2X2(, -, , -, -);
  1574. #endif
  1575. pa0 += 4;
  1576. pb0 += 4;
  1577. for (l = (temp - 1); l--;)
  1578. {
  1579. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1580. CGEMM_KERNEL_2X2(+, -, +, +,);
  1581. #endif
  1582. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1583. CGEMM_KERNEL_2X2(+, +, -, +,);
  1584. #endif
  1585. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1586. CGEMM_KERNEL_2X2(+, +, +, -,);
  1587. #endif
  1588. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1589. CGEMM_KERNEL_2X2(+, -, -, -,);
  1590. #endif
  1591. pa0 += 4;
  1592. pb0 += 4;
  1593. }
  1594. #if defined(TRMMKERNEL)
  1595. CGEMM_TRMM_SCALE_2X2
  1596. #else
  1597. CGEMM_SCALE_2X2
  1598. #endif
  1599. pc0 += 4;
  1600. pc1 += 4;
  1601. #if defined(TRMMKERNEL)
  1602. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1603. temp = k - off;
  1604. #ifdef LEFT
  1605. temp -= 2; // number of values in A
  1606. #else
  1607. temp -= 2; // number of values in B
  1608. #endif
  1609. pa0 += temp * 2 * 2;
  1610. pb0 += temp * 2 * 2;
  1611. #endif
  1612. #ifdef LEFT
  1613. off += 2; // number of values in A
  1614. #endif
  1615. #endif
  1616. }
  /* 1-row tile of the 2-column panel. */
  1617. if (m & 1)
  1618. {
  1619. #if defined(TRMMKERNEL)
  1620. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1621. pb0 = B;
  1622. #else
  1623. pa0 += off * 2 * 1;
  1624. pb0 = B + off * 2 * 2;
  1625. #endif
  1626. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1627. temp = k - off;
  1628. #elif defined(LEFT)
  1629. temp = off + 1; // number of values in A
  1630. #else
  1631. temp = off + 2; // number of values in B
  1632. #endif
  1633. #else
  1634. pb0 = B;
  1635. temp = k;
  1636. #endif
  1637. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1638. CGEMM_KERNEL_1X2(, -, , +, +);
  1639. #endif
  1640. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1641. CGEMM_KERNEL_1X2(, +, , +, -);
  1642. #endif
  1643. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1644. CGEMM_KERNEL_1X2(, +, , -, +);
  1645. #endif
  1646. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1647. CGEMM_KERNEL_1X2(, -, , -, -);
  1648. #endif
  1649. pa0 += 2;
  1650. pb0 += 4;
  1651. for (l = (temp - 1); l--;)
  1652. {
  1653. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1654. CGEMM_KERNEL_1X2(+, -, +, +,);
  1655. #endif
  1656. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1657. CGEMM_KERNEL_1X2(+, +, -, +,);
  1658. #endif
  1659. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1660. CGEMM_KERNEL_1X2(+, +, +, -,);
  1661. #endif
  1662. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1663. CGEMM_KERNEL_1X2(+, -, -, -,);
  1664. #endif
  1665. pa0 += 2;
  1666. pb0 += 4;
  1667. }
  1668. #if defined(TRMMKERNEL)
  1669. CGEMM_TRMM_SCALE_1X2
  1670. #else
  1671. CGEMM_SCALE_1X2
  1672. #endif
  1673. pc0 += 2;
  1674. pc1 += 2;
  1675. #if defined(TRMMKERNEL)
  1676. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1677. temp = k - off;
  1678. #ifdef LEFT
  1679. temp -= 1; // number of values in A
  1680. #else
  1681. temp -= 2; // number of values in B
  1682. #endif
  1683. pa0 += temp * 2 * 1;
  1684. pb0 += temp * 2 * 2;
  1685. #endif
  1686. #ifdef LEFT
  1687. off += 1; // number of values in A
  1688. #endif
  1689. #endif
  1690. }
  /* Panel done: advance `off` (non-LEFT TRMM) and move B / C past 2 columns. */
  1691. #if defined(TRMMKERNEL) && !defined(LEFT)
  1692. off += 2; // number of values in B
  1693. #endif
  1694. B += (k << 2);
  1695. C += (ldc << 2);
  1696. }
  /* ---- single trailing column ---------------------------------------- */
  1697. if (n & 1)
  1698. {
  1699. pc0 = C;
  1700. #if defined(TRMMKERNEL) && defined(LEFT)
  1701. off = offset;
  1702. #endif
  1703. pa0 = A;
  /* 8-row tiles; pb0 advances by one complex value (2 FLOATs) per step. */
  1704. for (i = (m >> 3); i--;)
  1705. {
  1706. #if defined(TRMMKERNEL)
  1707. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1708. pb0 = B;
  1709. #else
  1710. pa0 += off * 2 * 8;
  1711. pb0 = B + off * 2 * 1;
  1712. #endif
  1713. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1714. temp = k - off;
  1715. #elif defined(LEFT)
  1716. temp = off + 8; // number of values in A
  1717. #else
  1718. temp = off + 1; // number of values in B
  1719. #endif
  1720. #else
  1721. pb0 = B;
  1722. temp = k;
  1723. #endif
  1724. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1725. CGEMM_KERNEL_8X1_MSA(, -, , +, +);
  1726. #endif
  1727. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1728. CGEMM_KERNEL_8X1_MSA(, +, , +, -);
  1729. #endif
  1730. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1731. CGEMM_KERNEL_8X1_MSA(, +, , -, +);
  1732. #endif
  1733. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1734. CGEMM_KERNEL_8X1_MSA(, -, , -, -);
  1735. #endif
  1736. pb0 += 2;
  1737. for (l = (temp - 1); l--;)
  1738. {
  1739. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1740. CGEMM_KERNEL_8X1_MSA(+, -, +, +,);
  1741. #endif
  1742. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1743. CGEMM_KERNEL_8X1_MSA(+, +, -, +,);
  1744. #endif
  1745. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1746. CGEMM_KERNEL_8X1_MSA(+, +, +, -,);
  1747. #endif
  1748. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1749. CGEMM_KERNEL_8X1_MSA(+, -, -, -,);
  1750. #endif
  1751. pb0 += 2;
  1752. }
  1753. #if defined(TRMMKERNEL)
  1754. CGEMM_TRMM_SCALE_8X1_MSA
  1755. #else
  1756. CGEMM_SCALE_8X1_MSA
  1757. #endif
  1758. #if defined(TRMMKERNEL)
  1759. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1760. temp = k - off;
  1761. #ifdef LEFT
  1762. temp -= 8; // number of values in A
  1763. #else
  1764. temp -= 1; // number of values in B
  1765. #endif
  1766. pa0 += temp * 2 * 8;
  1767. pb0 += temp * 2 * 1;
  1768. #endif
  1769. #ifdef LEFT
  1770. off += 8; // number of values in A
  1771. #endif
  1772. #endif
  1773. }
  /* 4-row tile of the single column. */
  1774. if (m & 4)
  1775. {
  1776. #if defined(TRMMKERNEL)
  1777. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1778. pb0 = B;
  1779. #else
  1780. pa0 += off * 2 * 4;
  1781. pb0 = B + off * 2 * 1;
  1782. #endif
  1783. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1784. temp = k - off;
  1785. #elif defined(LEFT)
  1786. temp = off + 4; // number of values in A
  1787. #else
  1788. temp = off + 1; // number of values in B
  1789. #endif
  1790. #else
  1791. pb0 = B;
  1792. temp = k;
  1793. #endif
  1794. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1795. CGEMM_KERNEL_4X1_MSA(, -, , +, +);
  1796. #endif
  1797. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1798. CGEMM_KERNEL_4X1_MSA(, +, , +, -);
  1799. #endif
  1800. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1801. CGEMM_KERNEL_4X1_MSA(, +, , -, +);
  1802. #endif
  1803. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1804. CGEMM_KERNEL_4X1_MSA(, -, , -, -);
  1805. #endif
  1806. pb0 += 2;
  1807. for (l = (temp - 1); l--;)
  1808. {
  1809. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1810. CGEMM_KERNEL_4X1_MSA(+, -, +, +,);
  1811. #endif
  1812. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1813. CGEMM_KERNEL_4X1_MSA(+, +, -, +,);
  1814. #endif
  1815. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1816. CGEMM_KERNEL_4X1_MSA(+, +, +, -,);
  1817. #endif
  1818. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1819. CGEMM_KERNEL_4X1_MSA(+, -, -, -,);
  1820. #endif
  1821. pb0 += 2;
  1822. }
  1823. #if defined(TRMMKERNEL)
  1824. CGEMM_TRMM_SCALE_4X1_MSA
  1825. #else
  1826. CGEMM_SCALE_4X1_MSA
  1827. #endif
  1828. #if defined(TRMMKERNEL)
  1829. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1830. temp = k - off;
  1831. #ifdef LEFT
  1832. temp -= 4; // number of values in A
  1833. #else
  1834. temp -= 1; // number of values in B
  1835. #endif
  1836. pa0 += temp * 2 * 4;
  1837. pb0 += temp * 2 * 1;
  1838. #endif
  1839. #ifdef LEFT
  1840. off += 4; // number of values in A
  1841. #endif
  1842. #endif
  1843. }
  /* 2-row tile of the single column. */
  1844. if (m & 2)
  1845. {
  1846. #if defined(TRMMKERNEL)
  1847. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1848. pb0 = B;
  1849. #else
  1850. pa0 += off * 2 * 2;
  1851. pb0 = B + off * 2 * 1;
  1852. #endif
  1853. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1854. temp = k - off;
  1855. #elif defined(LEFT)
  1856. temp = off + 2; // number of values in A
  1857. #else
  1858. temp = off + 1; // number of values in B
  1859. #endif
  1860. #else
  1861. pb0 = B;
  1862. temp = k;
  1863. #endif
  1864. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1865. CGEMM_KERNEL_2X1(, -, , +, +);
  1866. #endif
  1867. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1868. CGEMM_KERNEL_2X1(, +, , +, -);
  1869. #endif
  1870. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1871. CGEMM_KERNEL_2X1(, +, , -, +);
  1872. #endif
  1873. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1874. CGEMM_KERNEL_2X1(, -, , -, -);
  1875. #endif
  1876. pa0 += 4;
  1877. pb0 += 2;
  1878. for (l = (temp - 1); l--;)
  1879. {
  1880. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1881. CGEMM_KERNEL_2X1(+, -, +, +,);
  1882. #endif
  1883. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1884. CGEMM_KERNEL_2X1(+, +, -, +,);
  1885. #endif
  1886. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1887. CGEMM_KERNEL_2X1(+, +, +, -,);
  1888. #endif
  1889. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1890. CGEMM_KERNEL_2X1(+, -, -, -,);
  1891. #endif
  1892. pa0 += 4;
  1893. pb0 += 2;
  1894. }
  1895. #if defined(TRMMKERNEL)
  1896. CGEMM_TRMM_SCALE_2X1
  1897. #else
  1898. CGEMM_SCALE_2X1
  1899. #endif
  1900. pc0 += 4;
  1901. #if defined(TRMMKERNEL)
  1902. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1903. temp = k - off;
  1904. #ifdef LEFT
  1905. temp -= 2; // number of values in A
  1906. #else
  1907. temp -= 1; // number of values in B
  1908. #endif
  1909. pa0 += temp * 2 * 2;
  1910. pb0 += temp * 2 * 1;
  1911. #endif
  1912. #ifdef LEFT
  1913. off += 2; // number of values in A
  1914. #endif
  1915. #endif
  1916. }
  /* Final single element (1 row x 1 column). */
  1917. if (m & 1)
  1918. {
  1919. #if defined(TRMMKERNEL)
  1920. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1921. pb0 = B;
  1922. #else
  1923. pa0 += off * 2 * 1;
  1924. pb0 = B + off * 2 * 1;
  1925. #endif
  1926. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1927. temp = k - off;
  1928. #elif defined(LEFT)
  1929. temp = off + 1; // number of values in A
  1930. #else
  1931. temp = off + 1; // number of values in B
  1932. #endif
  1933. #else
  1934. pb0 = B;
  1935. temp = k;
  1936. #endif
  1937. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1938. CGEMM_KERNEL_1X1(, -, , +, +);
  1939. #endif
  1940. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1941. CGEMM_KERNEL_1X1(, +, , +, -);
  1942. #endif
  1943. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1944. CGEMM_KERNEL_1X1(, +, , -, +);
  1945. #endif
  1946. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1947. CGEMM_KERNEL_1X1(, -, , -, -);
  1948. #endif
  1949. pa0 += 2;
  1950. pb0 += 2;
  1951. for (l = (temp - 1); l--;)
  1952. {
  1953. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1954. CGEMM_KERNEL_1X1(+, -, +, +,);
  1955. #endif
  1956. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1957. CGEMM_KERNEL_1X1(+, +, -, +,);
  1958. #endif
  1959. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1960. CGEMM_KERNEL_1X1(+, +, +, -,);
  1961. #endif
  1962. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1963. CGEMM_KERNEL_1X1(+, -, -, -,);
  1964. #endif
  1965. pa0 += 2;
  1966. pb0 += 2;
  1967. }
  1968. #if defined(TRMMKERNEL)
  1969. CGEMM_TRMM_SCALE_1X1
  1970. #else
  1971. CGEMM_SCALE_1X1
  1972. #endif
  1973. pc0 += 2;
  1974. #if defined(TRMMKERNEL)
  1975. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1976. temp = k - off;
  1977. #ifdef LEFT
  1978. temp -= 1; // number of values in A
  1979. #else
  1980. temp -= 1; // number of values in B
  1981. #endif
  1982. pa0 += temp * 2 * 1;
  1983. pb0 += temp * 2 * 1;
  1984. #endif
  1985. #ifdef LEFT
  1986. off += 1; // number of values in A
  1987. #endif
  1988. #endif
  1989. }
  /* Panel done: advance `off` (non-LEFT TRMM) and move B / C past 1 column. */
  1990. #if defined(TRMMKERNEL) && !defined(LEFT)
  1991. off += 1; // number of values in B
  1992. #endif
  1993. B += (k << 1);
  1994. C += (ldc << 1);
  1995. }
  /* The kernel reports no errors. */
  1996. return 0;
  1997. }