You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

dgemm_kernel_8x4_msa.c 49 kB


  1. /*******************************************************************************
  2. Copyright (c) 2016, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #include "common.h"
  28. #include "macros_msa.h"
  /*
   * dgemmkernel_8x4_core_msa: computes the full 8-row by 4-column tiles of C.
   *
   * Iterates over (n >> 2) groups of four columns and, inside each group,
   * over (m >> 3) groups of eight rows. For every tile, sixteen two-lane
   * vector accumulators (res0..res15) collect products of values read
   * through pa0 (from A) and pb0 (from B) over `temp` steps; the tile is
   * then written through pc0..pc3, four column pointers spaced ldc apart.
   *
   * Without TRMMKERNEL: temp == k and C is updated as C += alpha * acc.
   * With TRMMKERNEL: the start positions in A/B and the step count depend
   * on `offset`, LEFT and TRANSA, and C is overwritten with alpha * acc.
   *
   * Leftover rows (m & 7) and leftover columns (n & 3) are not handled here.
   *
   * NOTE(review): the first step is computed unconditionally before the
   * loop, so temp is presumably always >= 1 -- confirm against callers.
   * NOTE(review): LD_DP*_INC / ST_DP*_INC / LD_DP* come from macros_msa.h;
   * they are presumably vector load/store helpers, the _INC forms also
   * advancing the pointer -- confirm against that header.
   */
  29. static void __attribute__ ((noinline))
  30. dgemmkernel_8x4_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha,
  31. FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc
  32. #ifdef TRMMKERNEL
  33. , BLASLONG offset
  34. #endif
  35. )
  36. {
  37. BLASLONG i, j, l, temp;
  38. #if defined(TRMMKERNEL)
  39. BLASLONG off;
  40. #endif
  41. FLOAT *pc0, *pc1, *pc2, *pc3, *pa0, *pb0;
  42. v2f64 v_alpha = {alpha, alpha};
  43. v2f64 src_a0, src_a1, src_a2, src_a3, src_b, src_b0, src_b1;
  44. v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  45. v2f64 res0, res1, res2, res3, res4, res5, res6, res7;
  46. v2f64 res8, res9, res10, res11, res12, res13, res14, res15;
  // !LEFT: off starts at -offset once and grows by 4 after each column group.
  47. #if defined(TRMMKERNEL) && !defined(LEFT)
  48. off = -offset;
  49. #endif
  // One iteration per group of four columns of C.
  50. for (j = (n >> 2); j--;)
  51. {
  // Column pointers for the four columns of this group, ldc apart.
  52. pc0 = C;
  53. pc1 = pc0 + ldc;
  54. pc2 = pc1 + ldc;
  55. pc3 = pc2 + ldc;
  // LEFT: off restarts at offset for every column group.
  56. #if defined(TRMMKERNEL) && defined(LEFT)
  57. off = offset;
  58. #endif
  // pa0 is set once per column group and keeps advancing across row tiles.
  59. pa0 = A;
  // One iteration per group of eight rows.
  60. for (i = (m >> 3); i--;)
  61. {
  // Select the starting positions in A/B and the step count (temp) for this tile.
  62. #if defined(TRMMKERNEL)
  63. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  64. pb0 = B;
  65. #else
  66. pa0 += off * 8;
  67. pb0 = B + off * 4;
  68. #endif
  69. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  70. temp = k - off;
  71. #elif defined(LEFT)
  72. temp = off + 8; // number of values in A
  73. #else
  74. temp = off + 4; // number of values in B
  75. #endif
  76. #else // #if !defined(TRMMKERNEL)
  77. pb0 = B;
  78. temp = k;
  79. #endif
  // Optional prefetch hints for upcoming A and B data.
  80. #ifdef ENABLE_PREFETCH
  81. __asm__ __volatile__(
  82. "pref 0, 64(%[pa0]) \n\t"
  83. "pref 0, 96(%[pa0]) \n\t"
  84. "pref 0, 128(%[pa0]) \n\t"
  85. "pref 0, 160(%[pa0]) \n\t"
  86. "pref 0, 32(%[pb0]) \n\t"
  87. "pref 0, 64(%[pb0]) \n\t"
  88. "pref 0, 96(%[pb0]) \n\t"
  89. :
  90. : [pa0] "r" (pa0), [pb0] "r" (pb0)
  91. );
  92. #endif
  // Peeled first step: res0..res15 are initialised by assignment, so no
  // zeroing of the accumulators is needed.
  93. LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
  94. LD_DP2_INC(pb0, 2, src_b0, src_b1);
  // Interleaving a vector with itself presumably duplicates one B value
  // into both lanes (ilvr: one half, ilvl: the other) -- TODO confirm.
  95. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  96. res0 = src_a0 * src_b;
  97. res1 = src_a1 * src_b;
  98. res2 = src_a2 * src_b;
  99. res3 = src_a3 * src_b;
  100. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  101. res4 = src_a0 * src_b;
  102. res5 = src_a1 * src_b;
  103. res6 = src_a2 * src_b;
  104. res7 = src_a3 * src_b;
  105. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
  106. res8 = src_a0 * src_b;
  107. res9 = src_a1 * src_b;
  108. res10 = src_a2 * src_b;
  109. res11 = src_a3 * src_b;
  110. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
  111. res12 = src_a0 * src_b;
  112. res13 = src_a1 * src_b;
  113. res14 = src_a2 * src_b;
  114. res15 = src_a3 * src_b;
  // Remaining (temp - 1) steps, two steps per loop iteration.
  115. for (l = ((temp - 1) >> 1); l--;)
  116. {
  117. #ifdef ENABLE_PREFETCH
  118. __asm__ __volatile__(
  119. "pref 0, 128(%[pa0]) \n\t"
  120. "pref 0, 160(%[pa0]) \n\t"
  121. "pref 0, 192(%[pa0]) \n\t"
  122. "pref 0, 224(%[pa0]) \n\t"
  123. :
  124. : [pa0] "r" (pa0)
  125. );
  126. #endif
  // First of the two unrolled steps.
  127. LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
  128. LD_DP2_INC(pb0, 2, src_b0, src_b1);
  129. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  130. res0 += src_a0 * src_b;
  131. res1 += src_a1 * src_b;
  132. res2 += src_a2 * src_b;
  133. res3 += src_a3 * src_b;
  134. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  135. res4 += src_a0 * src_b;
  136. res5 += src_a1 * src_b;
  137. res6 += src_a2 * src_b;
  138. res7 += src_a3 * src_b;
  139. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
  140. res8 += src_a0 * src_b;
  141. res9 += src_a1 * src_b;
  142. res10 += src_a2 * src_b;
  143. res11 += src_a3 * src_b;
  144. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
  145. res12 += src_a0 * src_b;
  146. res13 += src_a1 * src_b;
  147. res14 += src_a2 * src_b;
  148. res15 += src_a3 * src_b;
  // Second of the two unrolled steps.
  149. LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
  150. #ifdef ENABLE_PREFETCH
  151. __asm__ __volatile__(
  152. "pref 0, 64(%[pb0]) \n\t"
  153. "pref 0, 96(%[pb0]) \n\t"
  154. :
  155. : [pb0] "r" (pb0)
  156. );
  157. #endif
  158. LD_DP2_INC(pb0, 2, src_b0, src_b1);
  159. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  160. res0 += src_a0 * src_b;
  161. res1 += src_a1 * src_b;
  162. res2 += src_a2 * src_b;
  163. res3 += src_a3 * src_b;
  164. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  165. res4 += src_a0 * src_b;
  166. res5 += src_a1 * src_b;
  167. res6 += src_a2 * src_b;
  168. res7 += src_a3 * src_b;
  169. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
  170. res8 += src_a0 * src_b;
  171. res9 += src_a1 * src_b;
  172. res10 += src_a2 * src_b;
  173. res11 += src_a3 * src_b;
  174. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
  175. res12 += src_a0 * src_b;
  176. res13 += src_a1 * src_b;
  177. res14 += src_a2 * src_b;
  178. res15 += src_a3 * src_b;
  179. }
  // One leftover step when (temp - 1) is odd.
  180. if ((temp - 1) & 1)
  181. {
  182. LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
  183. LD_DP2_INC(pb0, 2, src_b0, src_b1);
  184. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  185. res0 += src_a0 * src_b;
  186. res1 += src_a1 * src_b;
  187. res2 += src_a2 * src_b;
  188. res3 += src_a3 * src_b;
  189. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  190. res4 += src_a0 * src_b;
  191. res5 += src_a1 * src_b;
  192. res6 += src_a2 * src_b;
  193. res7 += src_a3 * src_b;
  194. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
  195. res8 += src_a0 * src_b;
  196. res9 += src_a1 * src_b;
  197. res10 += src_a2 * src_b;
  198. res11 += src_a3 * src_b;
  199. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
  200. res12 += src_a0 * src_b;
  201. res13 += src_a1 * src_b;
  202. res14 += src_a2 * src_b;
  203. res15 += src_a3 * src_b;
  204. }
  // Optional prefetch hints for the four C columns.
  205. #ifdef ENABLE_PREFETCH
  206. __asm__ __volatile__(
  207. "pref 0, 64(%[pc0]) \n\t"
  208. "pref 0, 64(%[pc1]) \n\t"
  209. "pref 0, 64(%[pc2]) \n\t"
  210. "pref 0, 64(%[pc3]) \n\t"
  211. :
  212. : [pc0] "r" (pc0), [pc1] "r" (pc1),
  213. [pc2] "r" (pc2), [pc3] "r" (pc3)
  214. );
  215. #endif
  // Columns 0 and 1 of the tile (res0..res7): TRMM overwrites C with
  // alpha * acc, plain GEMM loads C and adds alpha * acc.
  216. #if defined(TRMMKERNEL)
  217. dst0 = res0 * v_alpha;
  218. dst1 = res1 * v_alpha;
  219. dst2 = res2 * v_alpha;
  220. dst3 = res3 * v_alpha;
  221. dst4 = res4 * v_alpha;
  222. dst5 = res5 * v_alpha;
  223. dst6 = res6 * v_alpha;
  224. dst7 = res7 * v_alpha;
  225. #else
  226. LD_DP4(pc0, 2, dst0, dst1, dst2, dst3);
  227. LD_DP4(pc1, 2, dst4, dst5, dst6, dst7);
  228. dst0 += res0 * v_alpha;
  229. dst1 += res1 * v_alpha;
  230. dst2 += res2 * v_alpha;
  231. dst3 += res3 * v_alpha;
  232. dst4 += res4 * v_alpha;
  233. dst5 += res5 * v_alpha;
  234. dst6 += res6 * v_alpha;
  235. dst7 += res7 * v_alpha;
  236. #endif
  237. ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2);
  238. ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2);
  // Columns 2 and 3 of the tile (res8..res15), same scheme; dst0..dst7 are reused.
  239. #if defined(TRMMKERNEL)
  240. dst0 = res8 * v_alpha;
  241. dst1 = res9 * v_alpha;
  242. dst2 = res10 * v_alpha;
  243. dst3 = res11 * v_alpha;
  244. dst4 = res12 * v_alpha;
  245. dst5 = res13 * v_alpha;
  246. dst6 = res14 * v_alpha;
  247. dst7 = res15 * v_alpha;
  248. #else
  249. LD_DP4(pc2, 2, dst0, dst1, dst2, dst3);
  250. LD_DP4(pc3, 2, dst4, dst5, dst6, dst7);
  251. dst0 += res8 * v_alpha;
  252. dst1 += res9 * v_alpha;
  253. dst2 += res10 * v_alpha;
  254. dst3 += res11 * v_alpha;
  255. dst4 += res12 * v_alpha;
  256. dst5 += res13 * v_alpha;
  257. dst6 += res14 * v_alpha;
  258. dst7 += res15 * v_alpha;
  259. #endif
  260. ST_DP4_INC(dst0, dst1, dst2, dst3, pc2, 2);
  261. ST_DP4_INC(dst4, dst5, dst6, dst7, pc3, 2);
  // TRMM bookkeeping after the tile: in the variants below, advance pa0/pb0
  // by the steps this tile did not consume; for LEFT, move off to the next
  // eight-row tile.
  262. #if defined(TRMMKERNEL)
  263. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  264. temp = k - off;
  265. #ifdef LEFT
  266. temp -= 8; // number of values in A
  267. #else
  268. temp -= 4; // number of values in B
  269. #endif
  270. pa0 += temp * 8;
  271. pb0 += temp * 4;
  272. #endif
  273. #ifdef LEFT
  274. off += 8; // number of values in A
  275. #endif
  276. #endif // #if defined(TRMMKERNEL)
  277. }
  278. #if defined(TRMMKERNEL) && !defined(LEFT)
  279. off += 4; // number of values in B
  280. #endif
  // Next group of four columns: B advances by 4 * k values, C by 4 * ldc.
  281. B += (k << 2);
  282. C += (ldc << 2);
  283. }
  284. }
  /*
   * dgemmkernel_7x4_core_msa: computes the leftover rows (m & 7) for each
   * group of four columns of C.
   *
   * For each of the (n >> 2) column groups, the A and C pointers are first
   * moved past the (m >> 3) eight-row tiles, then up to three sub-tiles are
   * computed in turn: 4 rows (m & 4), 2 rows (m & 2) and 1 row (m & 1).
   * The 4- and 2-row cases use two-lane vectors; the 1-row case uses scalar
   * FLOAT arithmetic.
   *
   * Without TRMMKERNEL: temp == k and C is updated as C += alpha * acc.
   * With TRMMKERNEL: the start positions in A/B and the step count depend
   * on `offset`, LEFT and TRANSA, and C is overwritten with alpha * acc.
   *
   * NOTE(review): in every sub-tile the first step is computed
   * unconditionally, so temp is presumably always >= 1 -- confirm against
   * callers.
   * NOTE(review): LD_DP* / ST_DP* helpers come from macros_msa.h; the _INC
   * forms presumably also advance the pointer -- confirm against that header.
   */
  285. static void __attribute__ ((noinline))
  286. dgemmkernel_7x4_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha,
  287. FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc
  288. #ifdef TRMMKERNEL
  289. , BLASLONG offset
  290. #endif
  291. )
  292. {
  293. BLASLONG j, l, temp;
  294. #if defined(TRMMKERNEL)
  295. BLASLONG off;
  296. #endif
  297. FLOAT *pc0, *pc1, *pc2, *pc3, *pa0, *pb0;
  298. FLOAT tmp0, tmp1, tmp2, tmp3;
  299. FLOAT a0, b0, b1, b2, b3;
  300. v2f64 v_alpha = {alpha, alpha};
  301. v2f64 src_a0, src_a1, src_b, src_b0, src_b1;
  302. v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  303. v2f64 res0, res1, res2, res3, res4, res5, res6, res7;
  // !LEFT: off starts at -offset once and grows by 4 after each column group.
  304. #if defined(TRMMKERNEL) && !defined(LEFT)
  305. off = -offset;
  306. #endif
  // One iteration per group of four columns of C.
  307. for (j = (n >> 2); j--;)
  308. {
  // TRMM: replay the pointer and off updates of the (m >> 3) eight-row
  // tiles without doing any arithmetic, so that pa0, pc0..pc3 and off end
  // up where the leftover rows start.
  309. #if defined(TRMMKERNEL)
  310. pc0 = C;
  311. pc1 = pc0 + ldc;
  312. pc2 = pc1 + ldc;
  313. pc3 = pc2 + ldc;
  314. pa0 = A;
  315. #if defined(LEFT)
  316. off = offset;
  317. #endif
  318. for (l = (m >> 3); l--;)
  319. {
  320. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  321. pb0 = B;
  322. #else
  323. pa0 += off * 8;
  324. pb0 = B + off * 4;
  325. #endif
  326. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  327. temp = k - off;
  328. #elif defined(LEFT)
  329. temp = off + 8; // number of values in A
  330. #else
  331. temp = off + 4; // number of values in B
  332. #endif
  // Skip the eight C values per column and the temp steps of A/B that the
  // eight-row tile covers.
  333. pc0 += 8;
  334. pc1 += 8;
  335. pc2 += 8;
  336. pc3 += 8;
  337. pa0 += 8 * temp;
  338. pb0 += 4 * temp;
  339. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  340. temp = k - off;
  341. #ifdef LEFT
  342. temp -= 8; // number of values in A
  343. #else
  344. temp -= 4; // number of values in B
  345. #endif
  346. pa0 += temp * 8;
  347. pb0 += temp * 4;
  348. #endif
  349. #ifdef LEFT
  350. off += 8; // number of values in A
  351. #endif
  352. }
  353. #else // #if !defined(TRMMKERNEL)
  // Plain GEMM: jump directly past the (m >> 3) eight-row tiles
  // (8 rows of C each, 8 * k values of A each).
  354. pc0 = C + 8 * (m >> 3);
  355. pc1 = pc0 + ldc;
  356. pc2 = pc1 + ldc;
  357. pc3 = pc2 + ldc;
  358. pa0 = A + k * 8 * (m >> 3);
  359. #endif
  // ---- 4-row sub-tile: 4 rows x 4 columns, accumulators res0..res7 ----
  360. if (m & 4)
  361. {
  362. #if defined(TRMMKERNEL)
  363. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  364. pb0 = B;
  365. #else
  366. pa0 += off * 4;
  367. pb0 = B + off * 4;
  368. #endif
  369. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  370. temp = k - off;
  371. #elif defined(LEFT)
  372. temp = off + 4; // number of values in A
  373. #else
  374. temp = off + 4; // number of values in B
  375. #endif
  376. #else // #if !defined(TRMMKERNEL)
  377. pb0 = B;
  378. temp = k;
  379. #endif
  // Peeled first step: accumulators are initialised by assignment.
  380. LD_DP2_INC(pa0, 2, src_a0, src_a1);
  381. LD_DP2_INC(pb0, 2, src_b0, src_b1);
  // Interleaving a vector with itself presumably duplicates one B value
  // into both lanes -- TODO confirm.
  382. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  383. res0 = src_a0 * src_b;
  384. res1 = src_a1 * src_b;
  385. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  386. res2 = src_a0 * src_b;
  387. res3 = src_a1 * src_b;
  388. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
  389. res4 = src_a0 * src_b;
  390. res5 = src_a1 * src_b;
  391. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
  392. res6 = src_a0 * src_b;
  393. res7 = src_a1 * src_b;
  // Remaining (temp - 1) steps, two steps per loop iteration.
  394. for (l = ((temp - 1) >> 1); l--;)
  395. {
  396. LD_DP2_INC(pa0, 2, src_a0, src_a1);
  397. LD_DP2_INC(pb0, 2, src_b0, src_b1);
  398. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  399. res0 += src_a0 * src_b;
  400. res1 += src_a1 * src_b;
  401. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  402. res2 += src_a0 * src_b;
  403. res3 += src_a1 * src_b;
  404. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
  405. res4 += src_a0 * src_b;
  406. res5 += src_a1 * src_b;
  407. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
  408. res6 += src_a0 * src_b;
  409. res7 += src_a1 * src_b;
  410. LD_DP2_INC(pa0, 2, src_a0, src_a1);
  411. LD_DP2_INC(pb0, 2, src_b0, src_b1);
  412. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  413. res0 += src_a0 * src_b;
  414. res1 += src_a1 * src_b;
  415. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  416. res2 += src_a0 * src_b;
  417. res3 += src_a1 * src_b;
  418. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
  419. res4 += src_a0 * src_b;
  420. res5 += src_a1 * src_b;
  421. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
  422. res6 += src_a0 * src_b;
  423. res7 += src_a1 * src_b;
  424. }
  // One leftover step when (temp - 1) is odd.
  425. if ((temp - 1) & 1)
  426. {
  427. LD_DP2_INC(pa0, 2, src_a0, src_a1);
  428. LD_DP2_INC(pb0, 2, src_b0, src_b1);
  429. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  430. res0 += src_a0 * src_b;
  431. res1 += src_a1 * src_b;
  432. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  433. res2 += src_a0 * src_b;
  434. res3 += src_a1 * src_b;
  435. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
  436. res4 += src_a0 * src_b;
  437. res5 += src_a1 * src_b;
  438. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
  439. res6 += src_a0 * src_b;
  440. res7 += src_a1 * src_b;
  441. }
  // Write-back: TRMM overwrites C with alpha * acc, plain GEMM adds it.
  442. #if defined(TRMMKERNEL)
  443. dst0 = res0 * v_alpha;
  444. dst1 = res1 * v_alpha;
  445. dst2 = res2 * v_alpha;
  446. dst3 = res3 * v_alpha;
  447. dst4 = res4 * v_alpha;
  448. dst5 = res5 * v_alpha;
  449. dst6 = res6 * v_alpha;
  450. dst7 = res7 * v_alpha;
  451. #else
  452. LD_DP2(pc0, 2, dst0, dst1);
  453. LD_DP2(pc1, 2, dst2, dst3);
  454. LD_DP2(pc2, 2, dst4, dst5);
  455. LD_DP2(pc3, 2, dst6, dst7);
  456. dst0 += res0 * v_alpha;
  457. dst1 += res1 * v_alpha;
  458. dst2 += res2 * v_alpha;
  459. dst3 += res3 * v_alpha;
  460. dst4 += res4 * v_alpha;
  461. dst5 += res5 * v_alpha;
  462. dst6 += res6 * v_alpha;
  463. dst7 += res7 * v_alpha;
  464. #endif
  465. ST_DP2_INC(dst0, dst1, pc0, 2);
  466. ST_DP2_INC(dst2, dst3, pc1, 2);
  467. ST_DP2_INC(dst4, dst5, pc2, 2);
  468. ST_DP2_INC(dst6, dst7, pc3, 2);
  // TRMM bookkeeping after the 4-row sub-tile.
  469. #if defined(TRMMKERNEL)
  470. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  471. temp = k - off;
  472. #ifdef LEFT
  473. temp -= 4; // number of values in A
  474. #else
  475. temp -= 4; // number of values in B
  476. #endif
  477. pa0 += temp * 4;
  478. pb0 += temp * 4;
  479. #endif
  480. #ifdef LEFT
  481. off += 4; // number of values in A
  482. #endif
  483. #endif // #if defined(TRMMKERNEL)
  484. }
  // ---- 2-row sub-tile: 2 rows x 4 columns, accumulators res0..res3 ----
  485. if (m & 2)
  486. {
  487. #if defined(TRMMKERNEL)
  488. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  489. pb0 = B;
  490. #else
  491. pa0 += off * 2;
  492. pb0 = B + off * 4;
  493. #endif
  494. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  495. temp = k - off;
  496. #elif defined(LEFT)
  497. temp = off + 2; // number of values in A
  498. #else
  499. temp = off + 4; // number of values in B
  500. #endif
  501. #else // #if !defined(TRMMKERNEL)
  502. pb0 = B;
  503. temp = k;
  504. #endif
  // Peeled first step: accumulators are initialised by assignment.
  505. src_a0 = LD_DP(pa0);
  506. pa0 += 2;
  507. LD_DP2_INC(pb0, 2, src_b0, src_b1);
  508. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  509. res0 = src_a0 * src_b;
  510. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  511. res1 = src_a0 * src_b;
  512. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
  513. res2 = src_a0 * src_b;
  514. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
  515. res3 = src_a0 * src_b;
  // Remaining (temp - 1) steps, two steps per loop iteration.
  516. for (l = ((temp - 1) >> 1); l--;)
  517. {
  518. src_a0 = LD_DP(pa0);
  519. pa0 += 2;
  520. LD_DP2_INC(pb0, 2, src_b0, src_b1);
  521. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  522. res0 += src_a0 * src_b;
  523. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  524. res1 += src_a0 * src_b;
  525. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
  526. res2 += src_a0 * src_b;
  527. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
  528. res3 += src_a0 * src_b;
  529. src_a0 = LD_DP(pa0);
  530. pa0 += 2;
  531. LD_DP2_INC(pb0, 2, src_b0, src_b1);
  532. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  533. res0 += src_a0 * src_b;
  534. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  535. res1 += src_a0 * src_b;
  536. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
  537. res2 += src_a0 * src_b;
  538. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
  539. res3 += src_a0 * src_b;
  540. }
  // One leftover step when (temp - 1) is odd.
  541. if ((temp - 1) & 1)
  542. {
  543. src_a0 = LD_DP(pa0);
  544. pa0 += 2;
  545. LD_DP2_INC(pb0, 2, src_b0, src_b1);
  546. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  547. res0 += src_a0 * src_b;
  548. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  549. res1 += src_a0 * src_b;
  550. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
  551. res2 += src_a0 * src_b;
  552. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
  553. res3 += src_a0 * src_b;
  554. }
  // Write-back: one vector per column; TRMM overwrites, plain GEMM adds.
  555. #if defined(TRMMKERNEL)
  556. dst0 = res0 * v_alpha;
  557. dst1 = res1 * v_alpha;
  558. dst2 = res2 * v_alpha;
  559. dst3 = res3 * v_alpha;
  560. #else
  561. dst0 = LD_DP(pc0);
  562. dst1 = LD_DP(pc1);
  563. dst2 = LD_DP(pc2);
  564. dst3 = LD_DP(pc3);
  565. dst0 += res0 * v_alpha;
  566. dst1 += res1 * v_alpha;
  567. dst2 += res2 * v_alpha;
  568. dst3 += res3 * v_alpha;
  569. #endif
  570. ST_DP(dst0, pc0);
  571. ST_DP(dst1, pc1);
  572. ST_DP(dst2, pc2);
  573. ST_DP(dst3, pc3);
  574. pc0 += 2;
  575. pc1 += 2;
  576. pc2 += 2;
  577. pc3 += 2;
  // TRMM bookkeeping after the 2-row sub-tile.
  578. #if defined(TRMMKERNEL)
  579. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  580. temp = k - off;
  581. #ifdef LEFT
  582. temp -= 2; // number of values in A
  583. #else
  584. temp -= 4; // number of values in B
  585. #endif
  586. pa0 += temp * 2;
  587. pb0 += temp * 4;
  588. #endif
  589. #ifdef LEFT
  590. off += 2; // number of values in A
  591. #endif
  592. #endif // #if defined(TRMMKERNEL)
  593. }
  // ---- 1-row sub-tile: 1 row x 4 columns, scalar accumulators tmp0..tmp3 ----
  594. if (m & 1)
  595. {
  596. #if defined(TRMMKERNEL)
  597. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  598. pb0 = B;
  599. #else
  600. pa0 += off * 1;
  601. pb0 = B + off * 4;
  602. #endif
  603. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  604. temp = k - off;
  605. #elif defined(LEFT)
  606. temp = off + 1; // number of values in A
  607. #else
  608. temp = off + 4; // number of values in B
  609. #endif
  610. #else // #if !defined(TRMMKERNEL)
  611. pb0 = B;
  612. temp = k;
  613. #endif
  // Peeled first step: one A value times four B values.
  614. a0 = pa0[0];
  615. b0 = pb0[0];
  616. tmp0 = a0 * b0;
  617. b1 = pb0[1];
  618. tmp1 = a0 * b1;
  619. b2 = pb0[2];
  620. tmp2 = a0 * b2;
  621. b3 = pb0[3];
  622. tmp3 = a0 * b3;
  623. pa0 += 1;
  624. pb0 += 4;
  // Remaining (temp - 1) steps, two steps per loop iteration.
  625. for (l = ((temp - 1) >> 1); l--;)
  626. {
  627. a0 = pa0[0];
  628. b0 = pb0[0];
  629. tmp0 += a0 * b0;
  630. b1 = pb0[1];
  631. tmp1 += a0 * b1;
  632. b2 = pb0[2];
  633. tmp2 += a0 * b2;
  634. b3 = pb0[3];
  635. tmp3 += a0 * b3;
  636. pa0 += 1;
  637. pb0 += 4;
  638. a0 = pa0[0];
  639. b0 = pb0[0];
  640. tmp0 += a0 * b0;
  641. b1 = pb0[1];
  642. tmp1 += a0 * b1;
  643. b2 = pb0[2];
  644. tmp2 += a0 * b2;
  645. b3 = pb0[3];
  646. tmp3 += a0 * b3;
  647. pa0 += 1;
  648. pb0 += 4;
  649. }
  // One leftover step when (temp - 1) is odd.
  650. if ((temp - 1) & 1)
  651. {
  652. a0 = pa0[0];
  653. b0 = pb0[0];
  654. tmp0 += a0 * b0;
  655. b1 = pb0[1];
  656. tmp1 += a0 * b1;
  657. b2 = pb0[2];
  658. tmp2 += a0 * b2;
  659. b3 = pb0[3];
  660. tmp3 += a0 * b3;
  661. pa0 += 1;
  662. pb0 += 4;
  663. }
  // Scale by alpha, then overwrite (TRMM) or accumulate into (GEMM) C.
  664. tmp0 = alpha * tmp0;
  665. tmp1 = alpha * tmp1;
  666. tmp2 = alpha * tmp2;
  667. tmp3 = alpha * tmp3;
  668. #if defined(TRMMKERNEL)
  669. pc0[0] = tmp0;
  670. pc1[0] = tmp1;
  671. pc2[0] = tmp2;
  672. pc3[0] = tmp3;
  673. #else
  674. pc0[0] += tmp0;
  675. pc1[0] += tmp1;
  676. pc2[0] += tmp2;
  677. pc3[0] += tmp3;
  678. #endif
  679. pc0 += 1;
  680. pc1 += 1;
  681. pc2 += 1;
  682. pc3 += 1;
  // TRMM bookkeeping after the 1-row sub-tile.
  683. #if defined(TRMMKERNEL)
  684. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  685. temp = k - off;
  686. #ifdef LEFT
  687. temp -= 1; // number of values in A
  688. #else
  689. temp -= 4; // number of values in B
  690. #endif
  691. pa0 += temp * 1;
  692. pb0 += temp * 4;
  693. #endif
  694. #ifdef LEFT
  695. off += 1; // number of values in A
  696. #endif
  697. #endif // #if defined(TRMMKERNEL)
  698. }
  699. #if defined(TRMMKERNEL) && !defined(LEFT)
  700. off += 4; // number of values in B
  701. #endif
  // Next group of four columns: B advances by 4 * k values, C by 4 * ldc.
  702. B += (k << 2);
  703. C += (ldc << 2);
  704. }
  705. }
  706. static void __attribute__ ((noinline))
  707. dgemmkernel_8x4_non_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha,
  708. FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc
  709. #ifdef TRMMKERNEL
  710. , BLASLONG offset
  711. #endif
  712. )
  713. {
  714. BLASLONG i, l, temp;
  715. #if defined(TRMMKERNEL)
  716. BLASLONG off;
  717. #endif
  718. FLOAT *pc0, *pc1, *pa0, *pb0;
  719. FLOAT tmp0, tmp1;
  720. FLOAT a0, b0, b1;
  721. v2f64 v_alpha = {alpha, alpha};
  722. v2f64 src_a0, src_a1, src_a2, src_a3, src_b, src_b0;
  723. v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  724. v2f64 res0, res1, res2, res3, res4, res5, res6, res7;
  725. #if defined(TRMMKERNEL) && !defined(LEFT)
  726. off = -offset + (4 * (n >> 2));
  727. #endif
  728. if (n & 2)
  729. {
  730. pc0 = C;
  731. pc1 = pc0 + ldc;
  732. #if defined(TRMMKERNEL) && defined(LEFT)
  733. off = offset;
  734. #endif
  735. pa0 = A;
  736. for (i = (m >> 3); i--;)
  737. {
  738. #if defined(TRMMKERNEL)
  739. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  740. pb0 = B;
  741. #else
  742. pa0 += off * 8;
  743. pb0 = B + off * 2;
  744. #endif
  745. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  746. temp = k - off;
  747. #elif defined(LEFT)
  748. temp = off + 8; // number of values in A
  749. #else
  750. temp = off + 2; // number of values in B
  751. #endif
  752. #else // #if !defined(TRMMKERNEL)
  753. pb0 = B;
  754. temp = k;
  755. #endif
  756. LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
  757. src_b0 = LD_DP(pb0);
  758. pb0 += 2;
  759. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  760. res0 = src_a0 * src_b;
  761. res1 = src_a1 * src_b;
  762. res2 = src_a2 * src_b;
  763. res3 = src_a3 * src_b;
  764. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  765. res4 = src_a0 * src_b;
  766. res5 = src_a1 * src_b;
  767. res6 = src_a2 * src_b;
  768. res7 = src_a3 * src_b;
  769. for (l = ((temp - 1) >> 1); l--;)
  770. {
  771. LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
  772. src_b0 = LD_DP(pb0);
  773. pb0 += 2;
  774. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  775. res0 += src_a0 * src_b;
  776. res1 += src_a1 * src_b;
  777. res2 += src_a2 * src_b;
  778. res3 += src_a3 * src_b;
  779. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  780. res4 += src_a0 * src_b;
  781. res5 += src_a1 * src_b;
  782. res6 += src_a2 * src_b;
  783. res7 += src_a3 * src_b;
  784. LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
  785. src_b0 = LD_DP(pb0);
  786. pb0 += 2;
  787. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  788. res0 += src_a0 * src_b;
  789. res1 += src_a1 * src_b;
  790. res2 += src_a2 * src_b;
  791. res3 += src_a3 * src_b;
  792. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  793. res4 += src_a0 * src_b;
  794. res5 += src_a1 * src_b;
  795. res6 += src_a2 * src_b;
  796. res7 += src_a3 * src_b;
  797. }
  798. if ((temp - 1) & 1)
  799. {
  800. LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
  801. src_b0 = LD_DP(pb0);
  802. pb0 += 2;
  803. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  804. res0 += src_a0 * src_b;
  805. res1 += src_a1 * src_b;
  806. res2 += src_a2 * src_b;
  807. res3 += src_a3 * src_b;
  808. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  809. res4 += src_a0 * src_b;
  810. res5 += src_a1 * src_b;
  811. res6 += src_a2 * src_b;
  812. res7 += src_a3 * src_b;
  813. }
  814. #if defined(TRMMKERNEL)
  815. dst0 = res0 * v_alpha;
  816. dst1 = res1 * v_alpha;
  817. dst2 = res2 * v_alpha;
  818. dst3 = res3 * v_alpha;
  819. dst4 = res4 * v_alpha;
  820. dst5 = res5 * v_alpha;
  821. dst6 = res6 * v_alpha;
  822. dst7 = res7 * v_alpha;
  823. #else
  824. LD_DP4(pc0, 2, dst0, dst1, dst2, dst3);
  825. LD_DP4(pc1, 2, dst4, dst5, dst6, dst7);
  826. dst0 += res0 * v_alpha;
  827. dst1 += res1 * v_alpha;
  828. dst2 += res2 * v_alpha;
  829. dst3 += res3 * v_alpha;
  830. dst4 += res4 * v_alpha;
  831. dst5 += res5 * v_alpha;
  832. dst6 += res6 * v_alpha;
  833. dst7 += res7 * v_alpha;
  834. #endif
  835. ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2);
  836. ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2);
  837. #if defined(TRMMKERNEL)
  838. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  839. temp = k - off;
  840. #ifdef LEFT
  841. temp -= 8; // number of values in A
  842. #else
  843. temp -= 2; // number of values in B
  844. #endif
  845. pa0 += temp * 8;
  846. pb0 += temp * 2;
  847. #endif
  848. #ifdef LEFT
  849. off += 8; // number of values in A
  850. #endif
  851. #endif // #if defined(TRMMKERNEL)
  852. }
  853. if (m & 4)
  854. {
  855. #if defined(TRMMKERNEL)
  856. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  857. pb0 = B;
  858. #else
  859. pa0 += off * 4;
  860. pb0 = B + off * 2;
  861. #endif
  862. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  863. temp = k - off;
  864. #elif defined(LEFT)
  865. temp = off + 4; // number of values in A
  866. #else
  867. temp = off + 2; // number of values in B
  868. #endif
  869. #else // #if !defined(TRMMKERNEL)
  870. pb0 = B;
  871. temp = k;
  872. #endif
  873. LD_DP2_INC(pa0, 2, src_a0, src_a1);
  874. src_b0 = LD_DP(pb0);
  875. pb0 += 2;
  876. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  877. res0 = src_a0 * src_b;
  878. res1 = src_a1 * src_b;
  879. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  880. res2 = src_a0 * src_b;
  881. res3 = src_a1 * src_b;
  882. for (l = ((temp - 1) >> 1); l--;)
  883. {
  884. LD_DP2_INC(pa0, 2, src_a0, src_a1);
  885. src_b0 = LD_DP(pb0);
  886. pb0 += 2;
  887. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  888. res0 += src_a0 * src_b;
  889. res1 += src_a1 * src_b;
  890. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  891. res2 += src_a0 * src_b;
  892. res3 += src_a1 * src_b;
  893. LD_DP2_INC(pa0, 2, src_a0, src_a1);
  894. src_b0 = LD_DP(pb0);
  895. pb0 += 2;
  896. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  897. res0 += src_a0 * src_b;
  898. res1 += src_a1 * src_b;
  899. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  900. res2 += src_a0 * src_b;
  901. res3 += src_a1 * src_b;
  902. }
  903. if ((temp - 1) & 1)
  904. {
  905. LD_DP2_INC(pa0, 2, src_a0, src_a1);
  906. src_b0 = LD_DP(pb0);
  907. pb0 += 2;
  908. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  909. res0 += src_a0 * src_b;
  910. res1 += src_a1 * src_b;
  911. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  912. res2 += src_a0 * src_b;
  913. res3 += src_a1 * src_b;
  914. }
  915. #if defined(TRMMKERNEL)
  916. dst0 = res0 * v_alpha;
  917. dst1 = res1 * v_alpha;
  918. dst2 = res2 * v_alpha;
  919. dst3 = res3 * v_alpha;
  920. #else
  921. LD_DP2(pc0, 2, dst0, dst1);
  922. LD_DP2(pc1, 2, dst2, dst3);
  923. dst0 += res0 * v_alpha;
  924. dst1 += res1 * v_alpha;
  925. dst2 += res2 * v_alpha;
  926. dst3 += res3 * v_alpha;
  927. #endif
  928. ST_DP2_INC(dst0, dst1, pc0, 2);
  929. ST_DP2_INC(dst2, dst3, pc1, 2);
  930. #if defined(TRMMKERNEL)
  931. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  932. temp = k - off;
  933. #ifdef LEFT
  934. temp -= 4; // number of values in A
  935. #else
  936. temp -= 2; // number of values in B
  937. #endif
  938. pa0 += temp * 4;
  939. pb0 += temp * 2;
  940. #endif
  941. #ifdef LEFT
  942. off += 4; // number of values in A
  943. #endif
  944. #endif // #if defined(TRMMKERNEL)
  945. }
  946. if (m & 2)
  947. {
  948. #if defined(TRMMKERNEL)
  949. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  950. pb0 = B;
  951. #else
  952. pa0 += off * 2;
  953. pb0 = B + off * 2;
  954. #endif
  955. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  956. temp = k - off;
  957. #elif defined(LEFT)
  958. temp = off + 2; // number of values in A
  959. #else
  960. temp = off + 2; // number of values in B
  961. #endif
  962. #else // #if !defined(TRMMKERNEL)
  963. pb0 = B;
  964. temp = k;
  965. #endif
  966. src_a0 = LD_DP(pa0);
  967. pa0 += 2;
  968. src_b0 = LD_DP(pb0);
  969. pb0 += 2;
  970. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  971. res0 = src_a0 * src_b;
  972. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  973. res1 = src_a0 * src_b;
  974. for (l = ((temp - 1) >> 1); l--;)
  975. {
  976. src_a0 = LD_DP(pa0);
  977. pa0 += 2;
  978. src_b0 = LD_DP(pb0);
  979. pb0 += 2;
  980. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  981. res0 += src_a0 * src_b;
  982. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  983. res1 += src_a0 * src_b;
  984. src_a0 = LD_DP(pa0);
  985. pa0 += 2;
  986. src_b0 = LD_DP(pb0);
  987. pb0 += 2;
  988. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  989. res0 += src_a0 * src_b;
  990. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  991. res1 += src_a0 * src_b;
  992. }
  993. if ((temp - 1) & 1)
  994. {
  995. src_a0 = LD_DP(pa0);
  996. pa0 += 2;
  997. src_b0 = LD_DP(pb0);
  998. pb0 += 2;
  999. src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
  1000. res0 += src_a0 * src_b;
  1001. src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
  1002. res1 += src_a0 * src_b;
  1003. }
  1004. #if defined(TRMMKERNEL)
  1005. dst0 = res0 * v_alpha;
  1006. dst1 = res1 * v_alpha;
  1007. #else
  1008. dst0 = LD_DP(pc0);
  1009. dst1 = LD_DP(pc1);
  1010. dst0 += res0 * v_alpha;
  1011. dst1 += res1 * v_alpha;
  1012. #endif
  1013. ST_DP(dst0, pc0);
  1014. ST_DP(dst1, pc1);
  1015. pc0 += 2;
  1016. pc1 += 2;
  1017. #if defined(TRMMKERNEL)
  1018. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1019. temp = k - off;
  1020. #ifdef LEFT
  1021. temp -= 2; // number of values in A
  1022. #else
  1023. temp -= 2; // number of values in B
  1024. #endif
  1025. pa0 += temp * 2;
  1026. pb0 += temp * 2;
  1027. #endif
  1028. #ifdef LEFT
  1029. off += 2; // number of values in A
  1030. #endif
  1031. #endif // #if defined(TRMMKERNEL)
  1032. }
  1033. if (m & 1)
  1034. {
  1035. #if defined(TRMMKERNEL)
  1036. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1037. pb0 = B;
  1038. #else
  1039. pa0 += off * 1;
  1040. pb0 = B + off * 2;
  1041. #endif
  1042. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1043. temp = k - off;
  1044. #elif defined(LEFT)
  1045. temp = off + 1; // number of values in A
  1046. #else
  1047. temp = off + 2; // number of values in B
  1048. #endif
  1049. #else // #if !defined(TRMMKERNEL)
  1050. pb0 = B;
  1051. temp = k;
  1052. #endif
  1053. a0 = pa0[0];
  1054. b0 = pb0[0];
  1055. tmp0 = a0 * b0;
  1056. b1 = pb0[1];
  1057. tmp1 = a0 * b1;
  1058. pa0 += 1;
  1059. pb0 += 2;
  1060. for (l = ((temp - 1) >> 1); l--;)
  1061. {
  1062. a0 = pa0[0];
  1063. b0 = pb0[0];
  1064. tmp0 += a0 * b0;
  1065. b1 = pb0[1];
  1066. tmp1 += a0 * b1;
  1067. pa0 += 1;
  1068. pb0 += 2;
  1069. a0 = pa0[0];
  1070. b0 = pb0[0];
  1071. tmp0 += a0 * b0;
  1072. b1 = pb0[1];
  1073. tmp1 += a0 * b1;
  1074. pa0 += 1;
  1075. pb0 += 2;
  1076. }
  1077. if ((temp - 1) & 1)
  1078. {
  1079. a0 = pa0[0];
  1080. b0 = pb0[0];
  1081. tmp0 += a0 * b0;
  1082. b1 = pb0[1];
  1083. tmp1 += a0 * b1;
  1084. pa0 += 1;
  1085. pb0 += 2;
  1086. }
  1087. tmp0 = alpha * tmp0;
  1088. tmp1 = alpha * tmp1;
  1089. #if defined(TRMMKERNEL)
  1090. pc0[0] = tmp0;
  1091. pc1[0] = tmp1;
  1092. #else
  1093. pc0[0] += tmp0;
  1094. pc1[0] += tmp1;
  1095. #endif
  1096. pc0 += 1;
  1097. pc1 += 1;
  1098. #if defined(TRMMKERNEL)
  1099. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1100. temp = k - off;
  1101. #ifdef LEFT
  1102. temp -= 1; // number of values in A
  1103. #else
  1104. temp -= 2; // number of values in B
  1105. #endif
  1106. pa0 += temp * 1;
  1107. pb0 += temp * 2;
  1108. #endif
  1109. #ifdef LEFT
  1110. off += 1; // number of values in A
  1111. #endif
  1112. #endif // #if defined(TRMMKERNEL)
  1113. }
  1114. #if defined(TRMMKERNEL) && !defined(LEFT)
  1115. off += 2; // number of values in A
  1116. #endif
  1117. B += (k << 1);
  1118. C += (ldc << 1);
  1119. }
  1120. if (n & 1)
  1121. {
  1122. pc0 = C;
  1123. #if defined(TRMMKERNEL) && defined(LEFT)
  1124. off = offset;
  1125. #endif
  1126. pa0 = A;
  1127. for (i = (m >> 3); i--;)
  1128. {
  1129. #if defined(TRMMKERNEL)
  1130. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1131. pb0 = B;
  1132. #else
  1133. pa0 += off * 8;
  1134. pb0 = B + off * 1;
  1135. #endif
  1136. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1137. temp = k - off;
  1138. #elif defined(LEFT)
  1139. temp = off + 8; // number of values in A
  1140. #else
  1141. temp = off + 1; // number of values in B
  1142. #endif
  1143. #else // #if !defined(TRMMKERNEL)
  1144. pb0 = B;
  1145. temp = k;
  1146. #endif
  1147. LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
  1148. src_b[0] = pb0[0];
  1149. src_b[1] = pb0[0];
  1150. res0 = src_a0 * src_b;
  1151. res1 = src_a1 * src_b;
  1152. res2 = src_a2 * src_b;
  1153. res3 = src_a3 * src_b;
  1154. pb0 += 1;
  1155. for (l = ((temp - 1) >> 1); l--;)
  1156. {
  1157. LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
  1158. src_b[0] = pb0[0];
  1159. src_b[1] = pb0[0];
  1160. res0 += src_a0 * src_b;
  1161. res1 += src_a1 * src_b;
  1162. res2 += src_a2 * src_b;
  1163. res3 += src_a3 * src_b;
  1164. pb0 += 1;
  1165. LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
  1166. src_b[0] = pb0[0];
  1167. src_b[1] = pb0[0];
  1168. res0 += src_a0 * src_b;
  1169. res1 += src_a1 * src_b;
  1170. res2 += src_a2 * src_b;
  1171. res3 += src_a3 * src_b;
  1172. pb0 += 1;
  1173. }
  1174. if ((temp - 1) & 1)
  1175. {
  1176. LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
  1177. src_b[0] = pb0[0];
  1178. src_b[1] = pb0[0];
  1179. res0 += src_a0 * src_b;
  1180. res1 += src_a1 * src_b;
  1181. res2 += src_a2 * src_b;
  1182. res3 += src_a3 * src_b;
  1183. pb0 += 1;
  1184. }
  1185. #if defined(TRMMKERNEL)
  1186. dst0 = res0 * v_alpha;
  1187. dst1 = res1 * v_alpha;
  1188. dst2 = res2 * v_alpha;
  1189. dst3 = res3 * v_alpha;
  1190. #else
  1191. LD_DP4(pc0, 2, dst0, dst1, dst2, dst3);
  1192. dst0 += res0 * v_alpha;
  1193. dst1 += res1 * v_alpha;
  1194. dst2 += res2 * v_alpha;
  1195. dst3 += res3 * v_alpha;
  1196. #endif
  1197. ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2);
  1198. #if defined(TRMMKERNEL)
  1199. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1200. temp = k - off;
  1201. #ifdef LEFT
  1202. temp -= 8; // number of values in A
  1203. #else
  1204. temp -= 1; // number of values in B
  1205. #endif
  1206. pa0 += temp * 8;
  1207. pb0 += temp * 1;
  1208. #endif
  1209. #ifdef LEFT
  1210. off += 8; // number of values in A
  1211. #endif
  1212. #endif // #if defined(TRMMKERNEL)
  1213. }
  1214. if (m & 4)
  1215. {
  1216. #if defined(TRMMKERNEL)
  1217. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1218. pb0 = B;
  1219. #else
  1220. pa0 += off * 4;
  1221. pb0 = B + off * 1;
  1222. #endif
  1223. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1224. temp = k - off;
  1225. #elif defined(LEFT)
  1226. temp = off + 4; // number of values in A
  1227. #else
  1228. temp = off + 1; // number of values in B
  1229. #endif
  1230. #else // #if !defined(TRMMKERNEL)
  1231. pb0 = B;
  1232. temp = k;
  1233. #endif
  1234. LD_DP2_INC(pa0, 2, src_a0, src_a1);
  1235. src_b[0] = pb0[0];
  1236. src_b[1] = pb0[0];
  1237. res0 = src_a0 * src_b;
  1238. res1 = src_a1 * src_b;
  1239. pb0 += 1;
  1240. for (l = ((temp - 1) >> 1); l--;)
  1241. {
  1242. LD_DP2_INC(pa0, 2, src_a0, src_a1);
  1243. src_b[0] = pb0[0];
  1244. src_b[1] = pb0[0];
  1245. res0 += src_a0 * src_b;
  1246. res1 += src_a1 * src_b;
  1247. pb0 += 1;
  1248. LD_DP2_INC(pa0, 2, src_a0, src_a1);
  1249. src_b[0] = pb0[0];
  1250. src_b[1] = pb0[0];
  1251. res0 += src_a0 * src_b;
  1252. res1 += src_a1 * src_b;
  1253. pb0 += 1;
  1254. }
  1255. if ((temp - 1) & 1)
  1256. {
  1257. LD_DP2_INC(pa0, 2, src_a0, src_a1);
  1258. src_b[0] = pb0[0];
  1259. src_b[1] = pb0[0];
  1260. res0 += src_a0 * src_b;
  1261. res1 += src_a1 * src_b;
  1262. pb0 += 1;
  1263. }
  1264. #if defined(TRMMKERNEL)
  1265. dst0 = res0 * v_alpha;
  1266. dst1 = res1 * v_alpha;
  1267. #else
  1268. LD_DP2(pc0, 2, dst0, dst1);
  1269. dst0 += res0 * v_alpha;
  1270. dst1 += res1 * v_alpha;
  1271. #endif
  1272. ST_DP2_INC(dst0, dst1, pc0, 2);
  1273. #if defined(TRMMKERNEL)
  1274. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1275. temp = k - off;
  1276. #ifdef LEFT
  1277. temp -= 4; // number of values in A
  1278. #else
  1279. temp -= 1; // number of values in B
  1280. #endif
  1281. pa0 += temp * 4;
  1282. pb0 += temp * 1;
  1283. #endif
  1284. #ifdef LEFT
  1285. off += 4; // number of values in A
  1286. #endif
  1287. #endif // #if defined(TRMMKERNEL)
  1288. }
  1289. if (m & 2)
  1290. {
  1291. #if defined(TRMMKERNEL)
  1292. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1293. pb0 = B;
  1294. #else
  1295. pa0 += off * 2;
  1296. pb0 = B + off * 1;
  1297. #endif
  1298. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1299. temp = k - off;
  1300. #elif defined(LEFT)
  1301. temp = off + 2; // number of values in A
  1302. #else
  1303. temp = off + 1; // number of values in B
  1304. #endif
  1305. #else // #if !defined(TRMMKERNEL)
  1306. pb0 = B;
  1307. temp = k;
  1308. #endif
  1309. src_a0 = LD_DP(pa0);
  1310. src_b[0] = pb0[0];
  1311. src_b[1] = pb0[0];
  1312. res0 = src_a0 * src_b;
  1313. pa0 += 2;
  1314. pb0 += 1;
  1315. for (l = ((temp - 1) >> 1); l--;)
  1316. {
  1317. src_a0 = LD_DP(pa0);
  1318. src_b[0] = pb0[0];
  1319. src_b[1] = pb0[0];
  1320. res0 += src_a0 * src_b;
  1321. pa0 += 2;
  1322. pb0 += 1;
  1323. src_a0 = LD_DP(pa0);
  1324. src_b[0] = pb0[0];
  1325. src_b[1] = pb0[0];
  1326. res0 += src_a0 * src_b;
  1327. pa0 += 2;
  1328. pb0 += 1;
  1329. }
  1330. if ((temp - 1) & 1)
  1331. {
  1332. src_a0 = LD_DP(pa0);
  1333. src_b[0] = pb0[0];
  1334. src_b[1] = pb0[0];
  1335. res0 += src_a0 * src_b;
  1336. pa0 += 2;
  1337. pb0 += 1;
  1338. }
  1339. #if defined(TRMMKERNEL)
  1340. dst0 = res0 * v_alpha;
  1341. #else
  1342. dst0 = LD_DP(pc0);
  1343. dst0 += res0 * v_alpha;
  1344. #endif
  1345. ST_DP(dst0, pc0);
  1346. pc0 += 2;
  1347. #if defined(TRMMKERNEL)
  1348. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1349. temp = k - off;
  1350. #ifdef LEFT
  1351. temp -= 2; // number of values in A
  1352. #else
  1353. temp -= 1; // number of values in B
  1354. #endif
  1355. pa0 += temp * 2;
  1356. pb0 += temp * 1;
  1357. #endif
  1358. #ifdef LEFT
  1359. off += 2; // number of values in A
  1360. #endif
  1361. #endif // #if defined(TRMMKERNEL)
  1362. }
  1363. if (m & 1)
  1364. {
  1365. #if defined(TRMMKERNEL)
  1366. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1367. pb0 = B;
  1368. #else
  1369. pa0 += off * 1;
  1370. pb0 = B + off * 1;
  1371. #endif
  1372. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1373. temp = k - off;
  1374. #elif defined(LEFT)
  1375. temp = off + 1; // number of values in A
  1376. #else
  1377. temp = off + 1; // number of values in B
  1378. #endif
  1379. #else // #if !defined(TRMMKERNEL)
  1380. pb0 = B;
  1381. temp = k;
  1382. #endif
  1383. a0 = pa0[0];
  1384. b0 = pb0[0];
  1385. tmp0 = a0 * b0;
  1386. pa0 += 1;
  1387. pb0 += 1;
  1388. for (l = ((temp - 1) >> 1); l--;)
  1389. {
  1390. a0 = pa0[0];
  1391. b0 = pb0[0];
  1392. tmp0 += a0 * b0;
  1393. pa0 += 1;
  1394. pb0 += 1;
  1395. a0 = pa0[0];
  1396. b0 = pb0[0];
  1397. tmp0 += a0 * b0;
  1398. pa0 += 1;
  1399. pb0 += 1;
  1400. }
  1401. if ((temp - 1) & 1)
  1402. {
  1403. a0 = pa0[0];
  1404. b0 = pb0[0];
  1405. tmp0 += a0 * b0;
  1406. pa0 += 1;
  1407. pb0 += 1;
  1408. }
  1409. #if defined(TRMMKERNEL)
  1410. pc0[0] = alpha * tmp0;
  1411. #else
  1412. pc0[0] += alpha * tmp0;
  1413. #endif
  1414. }
  1415. }
  1416. }
  1417. int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
  1418. FLOAT *C, BLASLONG ldc
  1419. #ifdef TRMMKERNEL
  1420. , BLASLONG offset
  1421. #endif
  1422. )
  1423. {
  1424. if (n >> 2)
  1425. {
  1426. if (m >> 3)
  1427. #ifdef TRMMKERNEL
  1428. dgemmkernel_8x4_core_msa(m, n, k, alpha, A, B, C, ldc, offset);
  1429. #else
  1430. dgemmkernel_8x4_core_msa(m, n, k, alpha, A, B, C, ldc);
  1431. #endif
  1432. if (m & 7)
  1433. #ifdef TRMMKERNEL
  1434. dgemmkernel_7x4_core_msa(m, n, k, alpha, A, B, C, ldc, offset);
  1435. #else
  1436. dgemmkernel_7x4_core_msa(m, n, k, alpha, A, B, C, ldc);
  1437. #endif
  1438. }
  1439. if (n & 3)
  1440. {
  1441. B = B + (k << 2) * (n >> 2);
  1442. C = C + (ldc << 2) * (n >> 2);
  1443. #ifdef TRMMKERNEL
  1444. dgemmkernel_8x4_non_core_msa(m, n, k, alpha, A, B, C, ldc, offset);
  1445. #else
  1446. dgemmkernel_8x4_non_core_msa(m, n, k, alpha, A, B, C, ldc);
  1447. #endif
  1448. }
  1449. return 0;
  1450. }