You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

sgemm_kernel_power10.c 33 kB


  1. /*********************************************************************************
  2. Copyright (c) 2020, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. **********************************************************************************/
  27. #include "common.h"
  28. #include <altivec.h>
  /*
   * Vector types used by this kernel (GCC vector extensions):
   *   vec_t  - 16 raw bytes (__vector unsigned char); the kernel casts its
   *            A/B pointers to vec_t * before handing them to the MMA builtins.
   *   v4sf_t - 16-byte vector of FLOAT.
   *   v2sf_t - 8-byte vector of FLOAT.
   * NOTE(review): the "4sf"/"2sf" names suggest FLOAT is a 4-byte float here
   * (4 resp. 2 lanes); FLOAT comes from common.h -- confirm.
   */
  29. typedef __vector unsigned char vec_t;
  30. typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
  31. typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
  /*
   * Store helpers: unpack one MMA accumulator and write it to the C tile.
   *
   * Every macro below expands to several plain statements and relies on names
   * taken from the expansion site: `result` (scratch array the accumulator is
   * disassembled into), `rowC` (vector pointer re-aimed at each row), `CO`
   * (base of the current C tile), `ldc` and `alpha`.  ACC is passed as a
   * pointer to a __vector_quad; J is added to r*ldc to form the index into CO.
   *
   *   SAVE_ACC     - result[0..3]                -> rows r = 0..3 (v4sf_t stores)
   *   SAVE_ACC1    - result[0..3]                -> rows r = 4..7 (v4sf_t stores)
   *   SAVE4x2_ACC  - result[0], [2], [4], [6]    -> rows r = 0..3 (v2sf_t stores)
   *   SAVE4x2_ACC1 - result[0], [2], [4], [6]    -> rows r = 4..7 (v2sf_t stores)
   *   SAVE2x4_ACC  - result[0], result[1]        -> rows r = 0, 1 (v4sf_t stores)
   *
   * With TRMMKERNEL defined the scaled result overwrites C (`=`); otherwise it
   * is accumulated into C (`+=`).
   *
   * NOTE(review): the expansions are not wrapped in do { } while (0), so they
   * must not be used as the sole body of an unbraced if/else/loop.
   */
  32. #if defined(TRMMKERNEL)
  /* TRMM variants: C = alpha * result. */
  33. #define SAVE_ACC(ACC, J) \
  34. __builtin_mma_disassemble_acc ((void *)result, ACC); \
  35. rowC = (v4sf_t *) &CO[0* ldc+J]; \
  36. rowC[0] = result[0] * alpha; \
  37. rowC = (v4sf_t *) &CO[1*ldc+J]; \
  38. rowC[0] = result[1] * alpha; \
  39. rowC = (v4sf_t *) &CO[2*ldc+J]; \
  40. rowC[0] = result[2] * alpha; \
  41. rowC = (v4sf_t *) &CO[3*ldc+J]; \
  42. rowC[0] = result[3] * alpha;
  43. #define SAVE_ACC1(ACC, J) \
  44. __builtin_mma_disassemble_acc ((void *)result, ACC); \
  45. rowC = (v4sf_t *) &CO[4* ldc+J]; \
  46. rowC[0] = result[0] * alpha; \
  47. rowC = (v4sf_t *) &CO[5*ldc+J]; \
  48. rowC[0] = result[1] * alpha; \
  49. rowC = (v4sf_t *) &CO[6*ldc+J]; \
  50. rowC[0] = result[2] * alpha; \
  51. rowC = (v4sf_t *) &CO[7*ldc+J]; \
  52. rowC[0] = result[3] * alpha;
  53. #define SAVE4x2_ACC(ACC, J) \
  54. __builtin_mma_disassemble_acc ((void *)result, ACC); \
  55. rowC = (v2sf_t *) &CO[0* ldc+J]; \
  56. rowC[0] = result[0] * alpha; \
  57. rowC = (v2sf_t *) &CO[1* ldc+J]; \
  58. rowC[0] = result[2] * alpha; \
  59. rowC = (v2sf_t *) &CO[2* ldc+J]; \
  60. rowC[0] = result[4] * alpha; \
  61. rowC = (v2sf_t *) &CO[3* ldc+J]; \
  62. rowC[0] = result[6] * alpha;
  63. #define SAVE4x2_ACC1(ACC, J) \
  64. __builtin_mma_disassemble_acc ((void *)result, ACC); \
  65. rowC = (v2sf_t *) &CO[4* ldc+J]; \
  66. rowC[0] = result[0] * alpha; \
  67. rowC = (v2sf_t *) &CO[5* ldc+J]; \
  68. rowC[0] = result[2] * alpha; \
  69. rowC = (v2sf_t *) &CO[6* ldc+J]; \
  70. rowC[0] = result[4] * alpha; \
  71. rowC = (v2sf_t *) &CO[7* ldc+J]; \
  72. rowC[0] = result[6] * alpha;
  73. #define SAVE2x4_ACC(ACC, J) \
  74. __builtin_mma_disassemble_acc ((void *)result, ACC); \
  75. rowC = (v4sf_t *) &CO[0* ldc+J]; \
  76. rowC[0] = result[0] * alpha; \
  77. rowC = (v4sf_t *) &CO[1* ldc+J]; \
  78. rowC[0] = result[1] * alpha;
  79. #else
  /* GEMM variants: C += alpha * result. */
  80. #define SAVE_ACC(ACC, J) \
  81. __builtin_mma_disassemble_acc ((void *)result, ACC); \
  82. rowC = (v4sf_t *) &CO[0* ldc+J]; \
  83. rowC[0] += result[0] * alpha; \
  84. rowC = (v4sf_t *) &CO[1*ldc+J]; \
  85. rowC[0] += result[1] * alpha; \
  86. rowC = (v4sf_t *) &CO[2*ldc+J]; \
  87. rowC[0] += result[2] * alpha; \
  88. rowC = (v4sf_t *) &CO[3*ldc+J]; \
  89. rowC[0] += result[3] * alpha;
  90. #define SAVE_ACC1(ACC, J) \
  91. __builtin_mma_disassemble_acc ((void *)result, ACC); \
  92. rowC = (v4sf_t *) &CO[4* ldc+J]; \
  93. rowC[0] += result[0] * alpha; \
  94. rowC = (v4sf_t *) &CO[5*ldc+J]; \
  95. rowC[0] += result[1] * alpha; \
  96. rowC = (v4sf_t *) &CO[6*ldc+J]; \
  97. rowC[0] += result[2] * alpha; \
  98. rowC = (v4sf_t *) &CO[7*ldc+J]; \
  99. rowC[0] += result[3] * alpha;
  100. #define SAVE4x2_ACC(ACC, J) \
  101. __builtin_mma_disassemble_acc ((void *)result, ACC); \
  102. rowC = (v2sf_t *) &CO[0* ldc+J]; \
  103. rowC[0] += result[0] * alpha; \
  104. rowC = (v2sf_t *) &CO[1* ldc+J]; \
  105. rowC[0] += result[2] * alpha; \
  106. rowC = (v2sf_t *) &CO[2* ldc+J]; \
  107. rowC[0] += result[4] * alpha; \
  108. rowC = (v2sf_t *) &CO[3* ldc+J]; \
  109. rowC[0] += result[6] * alpha;
  110. #define SAVE4x2_ACC1(ACC, J) \
  111. __builtin_mma_disassemble_acc ((void *)result, ACC); \
  112. rowC = (v2sf_t *) &CO[4* ldc+J]; \
  113. rowC[0] += result[0] * alpha; \
  114. rowC = (v2sf_t *) &CO[5* ldc+J]; \
  115. rowC[0] += result[2] * alpha; \
  116. rowC = (v2sf_t *) &CO[6* ldc+J]; \
  117. rowC[0] += result[4] * alpha; \
  118. rowC = (v2sf_t *) &CO[7* ldc+J]; \
  119. rowC[0] += result[6] * alpha;
  120. #define SAVE2x4_ACC(ACC, J) \
  121. __builtin_mma_disassemble_acc ((void *)result, ACC); \
  122. rowC = (v4sf_t *) &CO[0* ldc+J]; \
  123. rowC[0] += result[0] * alpha; \
  124. rowC = (v4sf_t *) &CO[1* ldc+J]; \
  125. rowC[0] += result[1] * alpha;
  126. #endif
  /*
   * KERNEL(i, j): issue eight xvf32gerpp accumulate operations.
   * Even-numbered accumulators (acc0/2/4/6) take rowB[i], odd-numbered ones
   * (acc1/3/5/7) take rowB[i+1]; each accumulator pair (acc0/1, acc2/3, acc4/5,
   * acc6/7) takes rowA[j], rowA[j+1], rowA[j+2], rowA[j+3] respectively.
   * The names rowA, rowB and acc0..acc7 must be in scope at the expansion site.
   * NOTE(review): i and j are used unparenthesized (`i+1`, `j+1`, ...); the
   * call sites in CNAME's unrolled k-loop pass integer literals, which is safe.
   */
  127. #define KERNEL(i, j) \
  128. __builtin_mma_xvf32gerpp (&acc0, rowB[i], rowA[j]); \
  129. __builtin_mma_xvf32gerpp (&acc1, rowB[i+1], rowA[j]); \
  130. __builtin_mma_xvf32gerpp (&acc2, rowB[i], rowA[j+1]); \
  131. __builtin_mma_xvf32gerpp (&acc3, rowB[i+1], rowA[j+1]); \
  132. __builtin_mma_xvf32gerpp (&acc4, rowB[i], rowA[j+2]); \
  133. __builtin_mma_xvf32gerpp (&acc5, rowB[i+1], rowA[j+2]); \
  134. __builtin_mma_xvf32gerpp (&acc6, rowB[i], rowA[j+3]); \
  135. __builtin_mma_xvf32gerpp (&acc7, rowB[i+1], rowA[j+3]);
  /*
   * SET_ACC_ZERO4(): zero the accumulators acc0..acc3 with xxsetaccz.
   * The four __vector_quad variables must be declared at the expansion site.
   */
  136. #define SET_ACC_ZERO4() \
  137. __builtin_mma_xxsetaccz (&acc0); \
  138. __builtin_mma_xxsetaccz (&acc1); \
  139. __builtin_mma_xxsetaccz (&acc2); \
  140. __builtin_mma_xxsetaccz (&acc3);
  /*
   * SET_ACC_ZERO8(): zero the accumulators acc0..acc7 with xxsetaccz.
   * The eight __vector_quad variables must be declared at the expansion site.
   */
  141. #define SET_ACC_ZERO8() \
  142. __builtin_mma_xxsetaccz (&acc0); \
  143. __builtin_mma_xxsetaccz (&acc1); \
  144. __builtin_mma_xxsetaccz (&acc2); \
  145. __builtin_mma_xxsetaccz (&acc3); \
  146. __builtin_mma_xxsetaccz (&acc4); \
  147. __builtin_mma_xxsetaccz (&acc5); \
  148. __builtin_mma_xxsetaccz (&acc6); \
  149. __builtin_mma_xxsetaccz (&acc7);
  /*
   * PREFETCH1(x, y): data-cache-block-touch hint for the address x + y.
   * dcbt RA,RB forms its address as (RA|0) + (RB): naming r0 as RA means the
   * constant 0, not the register contents.  The RA operand (%0) therefore uses
   * the "b" constraint (any GPR except r0) so the base pointer can never be
   * silently dropped from the address; RB (%1) may be any GPR.
   * The expansion keeps its trailing ';' so existing call sites are unchanged.
   */
  150. #define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "b" (x), "r" (y) : "memory");
  /*
   * REFRESH_TEMP_BK(x, y): set `temp`, which the tile loops use as their
   * k-loop trip count.  Which formula applies is fixed at compile time:
   *   exactly one of LEFT / TRANSA defined : temp = k - off
   *   both LEFT and TRANSA defined         : temp = off + x
   *   neither defined                      : temp = off + y
   * Relies on `temp`, `k` and `off` being in scope at the expansion site.
   */
  151. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  152. #define REFRESH_TEMP_BK(x, y) \
  153. temp = k - off;
  154. #elif defined(LEFT)
  155. #define REFRESH_TEMP_BK(x, y) \
  156. temp = off + x;
  157. #else
  158. #define REFRESH_TEMP_BK(x, y) \
  159. temp = off + y;
  160. #endif
  /*
   * REFRESH_POINTERS(x, y): set BO (and, in one configuration, advance AO)
   * before a tile's k-loop, then recompute `temp` via REFRESH_TEMP_BK.
   *   LEFT and TRANSA both defined, or both undefined:
   *       BO = B; AO is left untouched.
   *   otherwise:
   *       AO advances by off * x and BO = B + off * y.
   * Relies on `AO`, `BO`, `B` and `off` at the expansion site.
   * NOTE(review): x and y appear unparenthesized in `off * x` / `off * y`;
   * pass simple constants only.
   */
  161. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  162. #define REFRESH_POINTERS(x, y) \
  163. BO = B; \
  164. REFRESH_TEMP_BK(x, y)
  165. #else
  166. #define REFRESH_POINTERS(x, y) \
  167. AO += off * x; \
  168. BO = B + off * y; \
  169. REFRESH_TEMP_BK(x, y)
  170. #endif
  /*
   * REFRESH_OFF(x): with LEFT defined, advance `off` by x; without LEFT the
   * macro expands to nothing.
   */
  171. #ifdef LEFT
  172. #define REFRESH_OFF(x) \
  173. off += x;
  174. #else
  175. #define REFRESH_OFF(x)
  176. #endif
  /*
   * UPDATE_TEMP(x, y): subtract one of the two arguments from `temp` --
   * x when LEFT is defined, y otherwise.
   */
  177. #ifdef LEFT
  178. #define UPDATE_TEMP(x, y) \
  179. temp -= x;
  180. #else
  181. #define UPDATE_TEMP(x, y) \
  182. temp -= y;
  183. #endif
  /*
   * REFRESH_TMP_AFTER_SAVE(x, y): pointer fix-up applied after a tile has been
   * stored.  When LEFT and TRANSA are both defined or both undefined, it
   * recomputes temp = k - off, reduces it through UPDATE_TEMP (by x with LEFT,
   * by y otherwise), and then advances AO by temp * x and BO by temp * y.
   * In the other configurations it expands to nothing.
   * Relies on `temp`, `k`, `off`, `AO` and `BO` at the expansion site.
   */
  184. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  185. #define REFRESH_TMP_AFTER_SAVE(x, y) \
  186. temp = k - off; \
  187. UPDATE_TEMP(x, y) \
  188. AO += temp * x; \
  189. BO += temp * y;
  190. #else
  191. #define REFRESH_TMP_AFTER_SAVE(x, y)
  192. #endif
  /*
   * REFRESH_AFTER_SAVE(x, y): expands REFRESH_TMP_AFTER_SAVE(x, y) followed by
   * REFRESH_OFF(x).  The expansion carries its own statement terminators, so
   * call sites may omit the trailing ';'.
   */
  193. #define REFRESH_AFTER_SAVE(x,y) \
  194. REFRESH_TMP_AFTER_SAVE(x, y) \
  195. REFRESH_OFF(x)
  196. /*************************************************************************************
  197. * GEMM Kernel
  198. *************************************************************************************/
  199. int
  200. CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
  201. FLOAT * C, BLASLONG ldc
  202. #ifdef TRMMKERNEL
  203. , BLASLONG offset
  204. #endif
  205. )
  206. {
  207. BLASLONG N = n;
  208. BLASLONG i1;
  209. #if defined(TRMMKERNEL)
  210. BLASLONG off;
  211. #endif
  212. #if defined(TRMMKERNEL) && !defined(LEFT)
  213. off = -offset;
  214. #endif
  215. v4sf_t valpha = { alpha, alpha, alpha, alpha };
  216. N = n >> 3;
  217. for (i1 = 0; i1 < N; i1++)
  218. {
  219. BLASLONG i, j, temp;
  220. FLOAT *CO;
  221. FLOAT *AO;
  222. #if defined(TRMMKERNEL) && defined(LEFT)
  223. off = offset;
  224. #endif
  225. CO = C;
  226. C += ldc << 3;
  227. AO = A;
  228. PREFETCH1 (A, 128);
  229. PREFETCH1 (A, 256);
  230. i = m >> 4;
  231. for (j = 0; j < i; j++)
  232. {
  233. FLOAT *BO;
  234. #if defined(TRMMKERNEL)
  235. REFRESH_POINTERS (16, 8);
  236. #else
  237. BO = B;
  238. temp = k;
  239. #endif
  240. v4sf_t *rowC;
  241. v4sf_t result[4];
  242. __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
  243. SET_ACC_ZERO8 ();
  244. BLASLONG l = 0;
  245. BLASLONG K = temp / 64;
  246. for (l = 0; l < K; l++)
  247. {
  248. vec_t *rowA = (vec_t *) & AO[0];
  249. vec_t *rowB = (vec_t *) & BO[0];
  250. KERNEL (0, 0);
  251. KERNEL (2, 4);
  252. KERNEL (4, 8);
  253. KERNEL (6, 12);
  254. KERNEL (8, 16);
  255. KERNEL (10, 20);
  256. KERNEL (12, 24);
  257. KERNEL (14, 28);
  258. KERNEL (16, 32);
  259. KERNEL (18, 36);
  260. KERNEL (20, 40);
  261. KERNEL (22, 44);
  262. KERNEL (24, 48);
  263. KERNEL (26, 52);
  264. KERNEL (28, 56);
  265. KERNEL (30, 60);
  266. KERNEL (32, 64);
  267. KERNEL (34, 68);
  268. KERNEL (36, 72);
  269. KERNEL (38, 76);
  270. KERNEL (40, 80);
  271. KERNEL (42, 84);
  272. KERNEL (44, 88);
  273. KERNEL (46, 92);
  274. KERNEL (48, 96);
  275. KERNEL (50, 100);
  276. KERNEL (52, 104);
  277. KERNEL (54, 108);
  278. KERNEL (56, 112);
  279. KERNEL (58, 116);
  280. KERNEL (60, 120);
  281. KERNEL (62, 124);
  282. KERNEL (64, 128);
  283. KERNEL (66, 132);
  284. KERNEL (68, 136);
  285. KERNEL (70, 140);
  286. KERNEL (72, 144);
  287. KERNEL (74, 148);
  288. KERNEL (76, 152);
  289. KERNEL (78, 156);
  290. KERNEL (80, 160);
  291. KERNEL (82, 164);
  292. KERNEL (84, 168);
  293. KERNEL (86, 172);
  294. KERNEL (88, 176);
  295. KERNEL (90, 180);
  296. KERNEL (92, 184);
  297. KERNEL (94, 188);
  298. KERNEL (96, 192);
  299. KERNEL (98, 196);
  300. KERNEL (100, 200);
  301. KERNEL (102, 204);
  302. KERNEL (104, 208);
  303. KERNEL (106, 212);
  304. KERNEL (108, 216);
  305. KERNEL (110, 220);
  306. KERNEL (112, 224);
  307. KERNEL (114, 228);
  308. KERNEL (116, 232);
  309. KERNEL (118, 236);
  310. KERNEL (120, 240);
  311. KERNEL (122, 244);
  312. KERNEL (124, 248);
  313. KERNEL (126, 252);
  314. AO += 1024;
  315. BO += 512;
  316. }
  317. if ((temp & 63) >> 5)
  318. {
  319. vec_t *rowA = (vec_t *) & AO[0];
  320. vec_t *rowB = (vec_t *) & BO[0];
  321. KERNEL (0, 0);
  322. KERNEL (2, 4);
  323. KERNEL (4, 8);
  324. KERNEL (6, 12);
  325. KERNEL (8, 16);
  326. KERNEL (10, 20);
  327. KERNEL (12, 24);
  328. KERNEL (14, 28);
  329. KERNEL (16, 32);
  330. KERNEL (18, 36);
  331. KERNEL (20, 40);
  332. KERNEL (22, 44);
  333. KERNEL (24, 48);
  334. KERNEL (26, 52);
  335. KERNEL (28, 56);
  336. KERNEL (30, 60);
  337. KERNEL (32, 64);
  338. KERNEL (34, 68);
  339. KERNEL (36, 72);
  340. KERNEL (38, 76);
  341. KERNEL (40, 80);
  342. KERNEL (42, 84);
  343. KERNEL (44, 88);
  344. KERNEL (46, 92);
  345. KERNEL (48, 96);
  346. KERNEL (50, 100);
  347. KERNEL (52, 104);
  348. KERNEL (54, 108);
  349. KERNEL (56, 112);
  350. KERNEL (58, 116);
  351. KERNEL (60, 120);
  352. KERNEL (62, 124);
  353. AO += 512;
  354. BO += 256;
  355. }
  356. if ((temp & 31) >> 4)
  357. {
  358. vec_t *rowA = (vec_t *) & AO[0];
  359. vec_t *rowB = (vec_t *) & BO[0];
  360. KERNEL (0, 0);
  361. KERNEL (2, 4);
  362. KERNEL (4, 8);
  363. KERNEL (6, 12);
  364. KERNEL (8, 16);
  365. KERNEL (10, 20);
  366. KERNEL (12, 24);
  367. KERNEL (14, 28);
  368. KERNEL (16, 32);
  369. KERNEL (18, 36);
  370. KERNEL (20, 40);
  371. KERNEL (22, 44);
  372. KERNEL (24, 48);
  373. KERNEL (26, 52);
  374. KERNEL (28, 56);
  375. KERNEL (30, 60);
  376. AO += 256;
  377. BO += 128;
  378. }
  379. if ((temp & 15) >> 3)
  380. {
  381. vec_t *rowA = (vec_t *) & AO[0];
  382. vec_t *rowB = (vec_t *) & BO[0];
  383. KERNEL (0, 0);
  384. KERNEL (2, 4);
  385. KERNEL (4, 8);
  386. KERNEL (6, 12);
  387. KERNEL (8, 16);
  388. KERNEL (10, 20);
  389. KERNEL (12, 24);
  390. KERNEL (14, 28);
  391. AO += 128;
  392. BO += 64;
  393. }
  394. if ((temp & 7) >> 2)
  395. {
  396. vec_t *rowA = (vec_t *) & AO[0];
  397. vec_t *rowB = (vec_t *) & BO[0];
  398. KERNEL (0, 0);
  399. KERNEL (2, 4);
  400. KERNEL (4, 8);
  401. KERNEL (6, 12);
  402. AO += 64;
  403. BO += 32;
  404. }
  405. if ((temp & 3) >> 1)
  406. {
  407. vec_t *rowA = (vec_t *) & AO[0];
  408. vec_t *rowB = (vec_t *) & BO[0];
  409. KERNEL (0, 0);
  410. KERNEL (2, 4);
  411. AO += 32;
  412. BO += 16;
  413. }
  414. if ((temp & 1) >> 0)
  415. {
  416. vec_t *rowA = (vec_t *) & AO[0];
  417. vec_t *rowB = (vec_t *) & BO[0];
  418. KERNEL (0, 0);
  419. AO += 16;
  420. BO += 8;
  421. }
  422. SAVE_ACC (&acc0, 0);
  423. SAVE_ACC (&acc2, 4);
  424. SAVE_ACC1 (&acc1, 0);
  425. SAVE_ACC1 (&acc3, 4);
  426. SAVE_ACC (&acc4, 8);
  427. SAVE_ACC (&acc6, 12);
  428. SAVE_ACC1 (&acc5, 8);
  429. SAVE_ACC1 (&acc7, 12);
  430. #if defined(TRMMKERNEL)
  431. REFRESH_AFTER_SAVE (16, 8)
  432. #endif
  433. CO += 16;
  434. }
  435. i = (m & 15) >> 3;
  436. for (j = 0; j < i; j++)
  437. {
  438. FLOAT *BO;
  439. #if defined(TRMMKERNEL)
  440. REFRESH_POINTERS (8, 8);
  441. #else
  442. BO = B;
  443. temp = k;
  444. #endif
  445. v4sf_t *rowC;
  446. v4sf_t result[4];
  447. __vector_quad acc0, acc1, acc2, acc3;
  448. SET_ACC_ZERO4 ();
  449. BLASLONG l = 0;
  450. for (l = 0; l < temp; l++)
  451. {
  452. vec_t *rowA = (vec_t *) & AO[l << 3];
  453. vec_t *rowB = (vec_t *) & BO[l << 3];
  454. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  455. __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]);
  456. __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[1]);
  457. __builtin_mma_xvf32gerpp (&acc3, rowB[1], rowA[1]);
  458. }
  459. SAVE_ACC (&acc0, 0);
  460. SAVE_ACC (&acc2, 4);
  461. SAVE_ACC1 (&acc1, 0);
  462. SAVE_ACC1 (&acc3, 4);
  463. AO += (temp << 3);
  464. BO += (temp << 3);
  465. CO += 8;
  466. #if defined(TRMMKERNEL)
  467. REFRESH_AFTER_SAVE (8, 8)
  468. #endif
  469. }
  470. i = (m & 7) >> 2;
  471. for (j = 0; j < i; j++)
  472. {
  473. FLOAT *BO;
  474. #if defined(TRMMKERNEL)
  475. REFRESH_POINTERS (4, 8);
  476. #else
  477. BO = B;
  478. temp = k;
  479. #endif
  480. v4sf_t *rowC;
  481. v4sf_t result[4];
  482. __vector_quad acc0, acc1;
  483. __builtin_mma_xxsetaccz (&acc0);
  484. __builtin_mma_xxsetaccz (&acc1);
  485. BLASLONG l = 0;
  486. for (l = 0; l < temp; l++)
  487. {
  488. vec_t *rowA = (vec_t *) & AO[l << 2];
  489. vec_t *rowB = (vec_t *) & BO[l << 3];
  490. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  491. __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]);
  492. }
  493. SAVE_ACC (&acc0, 0);
  494. SAVE_ACC1 (&acc1, 0);
  495. CO += 4;
  496. AO += (temp << 2);
  497. BO += (temp << 3);
  498. #if defined(TRMMKERNEL)
  499. REFRESH_AFTER_SAVE (4, 8)
  500. #endif
  501. }
  502. i = (m & 3) >> 1;
  503. for (j = 0; j < i; j++)
  504. {
  505. FLOAT *BO;
  506. #if defined(TRMMKERNEL)
  507. REFRESH_POINTERS (2, 8);
  508. #else
  509. BO = B;
  510. temp = k;
  511. #endif
  512. v2sf_t *rowC;
  513. v2sf_t result[8];
  514. __vector_quad acc0, acc1;
  515. __builtin_mma_xxsetaccz (&acc0);
  516. __builtin_mma_xxsetaccz (&acc1);
  517. BLASLONG l = 0;
  518. for (l = 0; l < temp; l++)
  519. {
  520. FLOAT t[4] = { 0 };
  521. t[0] = AO[l << 1], t[1] = AO[(l << 1) + 1];
  522. vec_t *rowA = (vec_t *) & t[0];
  523. vec_t *rowB = (vec_t *) & BO[l << 3];
  524. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  525. __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]);
  526. }
  527. SAVE4x2_ACC (&acc0, 0);
  528. SAVE4x2_ACC1 (&acc1, 0);
  529. CO += 2;
  530. AO += (temp << 1);
  531. BO += (temp << 3);
  532. #if defined(TRMMKERNEL)
  533. REFRESH_AFTER_SAVE (2, 8)
  534. #endif
  535. }
  536. i = (m & 1) >> 0;
  537. for (j = 0; j < i; j++)
  538. {
  539. FLOAT *BO;
  540. #if defined(TRMMKERNEL)
  541. REFRESH_POINTERS (1, 8);
  542. #else
  543. BO = B;
  544. temp = k;
  545. #endif
  546. BLASLONG l = 0;
  547. v4sf_t t = { 0, 0, 0, 0 };
  548. v4sf_t t1 = { 0, 0, 0, 0 };
  549. for (l = 0; l < temp; l++)
  550. {
  551. v4sf_t rowA = { AO[l], AO[l], AO[l], AO[l] };
  552. v4sf_t rowB = { BO[l << 3], BO[(l << 3) + 1], BO[(l << 3) + 2],
  553. BO[(l << 3) + 3]
  554. };
  555. v4sf_t rowB1 =
  556. { BO[(l << 3) + 4], BO[(l << 3) + 5], BO[(l << 3) + 6],
  557. BO[(l << 3) + 7]
  558. };
  559. t += rowA * rowB;
  560. t1 += rowA * rowB1;
  561. }
  562. t = t * valpha;
  563. t1 = t1 * valpha;
  564. #if defined(TRMMKERNEL)
  565. CO[0 * ldc] = t[0];
  566. CO[1 * ldc] = t[1];
  567. CO[2 * ldc] = t[2];
  568. CO[3 * ldc] = t[3];
  569. CO[4 * ldc] = t1[0];
  570. CO[5 * ldc] = t1[1];
  571. CO[6 * ldc] = t1[2];
  572. CO[7 * ldc] = t1[3];
  573. #else
  574. CO[0 * ldc] += t[0];
  575. CO[1 * ldc] += t[1];
  576. CO[2 * ldc] += t[2];
  577. CO[3 * ldc] += t[3];
  578. CO[4 * ldc] += t1[0];
  579. CO[5 * ldc] += t1[1];
  580. CO[6 * ldc] += t1[2];
  581. CO[7 * ldc] += t1[3];
  582. #endif
  583. CO += 1;
  584. AO += temp;
  585. BO += (temp << 3);
  586. #if defined(TRMMKERNEL)
  587. REFRESH_AFTER_SAVE (1, 8)
  588. #endif
  589. }
  590. #if defined(TRMMKERNEL) && !defined(LEFT)
  591. off += 8; // number of values in A
  592. #endif
  593. B += k << 3;
  594. }
  595. N = (n & 7) >> 2;
  596. for (i1 = 0; i1 < N; i1++)
  597. {
  598. BLASLONG i, j, temp;
  599. #if defined(TRMMKERNEL) && defined(LEFT)
  600. off = offset;
  601. #endif
  602. FLOAT *CO;
  603. FLOAT *AO;
  604. CO = C;
  605. C += ldc << 2;
  606. AO = A;
  607. #if !defined(TRMMKERNEL)
  608. i = m >> 5;
  609. for (j = 0; j < i; j++)
  610. {
  611. FLOAT *BO = B;
  612. v4sf_t *rowC;
  613. v4sf_t result[4];
  614. FLOAT *A1;
  615. A1 = AO + (16 * k);
  616. __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
  617. SET_ACC_ZERO8 ();
  618. BLASLONG l = 0;
  619. for (l = 0; l < k; l++)
  620. {
  621. vec_t *rowA = (vec_t *) & AO[l << 4];
  622. vec_t *rowA1 = (vec_t *) & A1[l << 4];
  623. vec_t *rowB = (vec_t *) & BO[l << 2];
  624. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  625. __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
  626. __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]);
  627. __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]);
  628. __builtin_mma_xvf32gerpp (&acc4, rowB[0], rowA1[0]);
  629. __builtin_mma_xvf32gerpp (&acc5, rowB[0], rowA1[1]);
  630. __builtin_mma_xvf32gerpp (&acc6, rowB[0], rowA1[2]);
  631. __builtin_mma_xvf32gerpp (&acc7, rowB[0], rowA1[3]);
  632. }
  633. SAVE_ACC (&acc0, 0);
  634. SAVE_ACC (&acc1, 4);
  635. CO += 8;
  636. SAVE_ACC (&acc2, 0);
  637. SAVE_ACC (&acc3, 4);
  638. CO += 8;
  639. SAVE_ACC (&acc4, 0);
  640. SAVE_ACC (&acc5, 4);
  641. CO += 8;
  642. SAVE_ACC (&acc6, 0);
  643. SAVE_ACC (&acc7, 4);
  644. CO += 8;
  645. AO += k << 5;
  646. BO += k << 2;
  647. }
  648. i = (m & 31) >> 4;
  649. #else
  650. i = m >> 4;
  651. #endif
  652. for (j = 0; j < i; j++)
  653. {
  654. FLOAT *BO;
  655. #if defined(TRMMKERNEL)
  656. REFRESH_POINTERS (16, 4);
  657. #else
  658. BO = B;
  659. temp = k;
  660. #endif
  661. v4sf_t *rowC;
  662. v4sf_t result[4];
  663. __vector_quad acc0, acc1, acc2, acc3;
  664. SET_ACC_ZERO4 ();
  665. BLASLONG l = 0;
  666. for (l = 0; l < temp; l++)
  667. {
  668. vec_t *rowA = (vec_t *) & AO[l << 4];
  669. vec_t *rowB = (vec_t *) & BO[l << 2];
  670. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  671. __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
  672. __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]);
  673. __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]);
  674. }
  675. SAVE_ACC (&acc0, 0);
  676. SAVE_ACC (&acc1, 4);
  677. CO += 8;
  678. SAVE_ACC (&acc2, 0);
  679. SAVE_ACC (&acc3, 4);
  680. CO += 8;
  681. AO += temp << 4;
  682. BO += temp << 2;
  683. #if defined(TRMMKERNEL)
  684. REFRESH_AFTER_SAVE (16, 4)
  685. #endif
  686. }
  687. i = (m & 15) >> 3;
  688. for (j = 0; j < i; j++)
  689. {
  690. FLOAT *BO;
  691. #if defined(TRMMKERNEL)
  692. REFRESH_POINTERS (8, 4);
  693. #else
  694. BO = B;
  695. temp = k;
  696. #endif
  697. v4sf_t *rowC;
  698. v4sf_t result[4];
  699. __vector_quad acc0, acc1;
  700. __builtin_mma_xxsetaccz (&acc0);
  701. __builtin_mma_xxsetaccz (&acc1);
  702. BLASLONG l = 0;
  703. for (l = 0; l < temp; l++)
  704. {
  705. vec_t *rowA = (vec_t *) & AO[l << 3];
  706. vec_t *rowB = (vec_t *) & BO[l << 2];
  707. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  708. __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
  709. }
  710. SAVE_ACC (&acc0, 0);
  711. SAVE_ACC (&acc1, 4);
  712. CO += 8;
  713. AO += temp << 3;
  714. BO += temp << 2;
  715. #if defined(TRMMKERNEL)
  716. REFRESH_AFTER_SAVE (8, 4)
  717. #endif
  718. }
  719. i = (m & 7) >> 2;
  720. for (j = 0; j < i; j++)
  721. {
  722. FLOAT *BO;
  723. #if defined(TRMMKERNEL)
  724. REFRESH_POINTERS (4, 4);
  725. #else
  726. BO = B;
  727. temp = k;
  728. #endif
  729. v4sf_t *rowC;
  730. __vector_quad acc0;
  731. v4sf_t result[4];
  732. __builtin_mma_xxsetaccz (&acc0);
  733. BLASLONG l = 0;
  734. for (l = 0; l < temp; l++)
  735. {
  736. vec_t *rowA = (vec_t *) & AO[l << 2];
  737. vec_t *rowB = (vec_t *) & BO[l << 2];
  738. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  739. }
  740. SAVE_ACC (&acc0, 0);
  741. CO += 4;
  742. AO += temp << 2;
  743. BO += temp << 2;
  744. #if defined(TRMMKERNEL)
  745. REFRESH_AFTER_SAVE (4, 4)
  746. #endif
  747. }
  748. i = (m & 3) >> 1;
  749. for (j = 0; j < i; j++)
  750. {
  751. FLOAT *BO;
  752. #if defined(TRMMKERNEL)
  753. REFRESH_POINTERS (2, 4);
  754. #else
  755. BO = B;
  756. temp = k;
  757. #endif
  758. v2sf_t *rowC;
  759. v2sf_t result[8];
  760. __vector_quad acc0;
  761. __builtin_mma_xxsetaccz (&acc0);
  762. BLASLONG l = 0;
  763. for (l = 0; l < temp; l++)
  764. {
  765. FLOAT t[4] = { 0 };
  766. t[0] = AO[l << 1], t[1] = AO[(l << 1) + 1];
  767. vec_t *rowA = (vec_t *) & t[0];
  768. vec_t *rowB = (vec_t *) & BO[l << 2];
  769. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  770. }
  771. SAVE4x2_ACC (&acc0, 0);
  772. CO += 2;
  773. AO += temp << 1;
  774. BO += temp << 2;
  775. #if defined(TRMMKERNEL)
  776. REFRESH_AFTER_SAVE (2, 4)
  777. #endif
  778. }
  779. i = (m & 1) >> 0;
  780. for (j = 0; j < i; j++)
  781. {
  782. FLOAT *BO;
  783. #if defined(TRMMKERNEL)
  784. REFRESH_POINTERS (1, 4)
  785. #else
  786. BO = B;
  787. temp = k;
  788. #endif
  789. BLASLONG l = 0;
  790. v4sf_t t = { 0, 0, 0, 0 };
  791. for (l = 0; l < temp; l++)
  792. {
  793. v4sf_t rowA = { AO[l], AO[l], AO[l], AO[l] };
  794. v4sf_t rowB = { BO[l << 2], BO[(l << 2) + 1], BO[(l << 2) + 2],
  795. BO[(l << 2) + 3]
  796. };
  797. t += rowA * rowB;
  798. }
  799. t = t * valpha;
  800. #if defined(TRMMKERNEL)
  801. CO[0 * ldc] = t[0];
  802. CO[1 * ldc] = t[1];
  803. CO[2 * ldc] = t[2];
  804. CO[3 * ldc] = t[3];
  805. #else
  806. CO[0 * ldc] += t[0];
  807. CO[1 * ldc] += t[1];
  808. CO[2 * ldc] += t[2];
  809. CO[3 * ldc] += t[3];
  810. #endif
  811. CO += 1;
  812. AO += temp;
  813. BO += temp << 2;
  814. #if defined(TRMMKERNEL)
  815. REFRESH_AFTER_SAVE (1, 4)
  816. #endif
  817. }
  818. #if defined(TRMMKERNEL) && !defined(LEFT)
  819. off += 4; // number of values in A
  820. #endif
  821. B += k << 2;
  822. }
  823. N = (n & 3) >> 1;
  824. for (i1 = 0; i1 < N; i1++)
  825. {
  826. BLASLONG i, j, temp;
  827. #if defined(TRMMKERNEL) && defined(LEFT)
  828. off = offset;
  829. #endif
  830. FLOAT *CO;
  831. FLOAT *AO;
  832. CO = C;
  833. C += ldc << 1;
  834. AO = A;
  835. #if !defined(TRMMKERNEL)
  836. i = m >> 5;
  837. for (j = 0; j < i; j++)
  838. {
  839. FLOAT *BO = B;
  840. v4sf_t *rowC;
  841. v4sf_t result[4];
  842. FLOAT *A1;
  843. A1 = AO + (16 * k);
  844. __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
  845. SET_ACC_ZERO8 ();
  846. BLASLONG l = 0;
  847. for (l = 0; l < k; l++)
  848. {
  849. FLOAT t[4] = { 0 };
  850. t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
  851. vec_t *rowB = (vec_t *) & t[0];
  852. vec_t *rowA = (vec_t *) & AO[l << 4];
  853. vec_t *rowA1 = (vec_t *) & A1[l << 4];
  854. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  855. __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
  856. __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]);
  857. __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]);
  858. __builtin_mma_xvf32gerpp (&acc4, rowB[0], rowA1[0]);
  859. __builtin_mma_xvf32gerpp (&acc5, rowB[0], rowA1[1]);
  860. __builtin_mma_xvf32gerpp (&acc6, rowB[0], rowA1[2]);
  861. __builtin_mma_xvf32gerpp (&acc7, rowB[0], rowA1[3]);
  862. }
  863. SAVE2x4_ACC (&acc0, 0);
  864. SAVE2x4_ACC (&acc1, 4);
  865. SAVE2x4_ACC (&acc2, 8);
  866. SAVE2x4_ACC (&acc3, 12);
  867. CO += 16;
  868. SAVE2x4_ACC (&acc4, 0);
  869. SAVE2x4_ACC (&acc5, 4);
  870. SAVE2x4_ACC (&acc6, 8);
  871. SAVE2x4_ACC (&acc7, 12);
  872. CO += 16;
  873. AO += k << 5;
  874. BO += k << 1;
  875. }
  876. i = (m & 31) >> 4;
  877. #else
  878. i = m >> 4;
  879. #endif
  880. for (j = 0; j < i; j++)
  881. {
  882. FLOAT *BO;
  883. v4sf_t *rowC;
  884. v4sf_t result[4];
  885. __vector_quad acc0, acc1, acc2, acc3;
  886. SET_ACC_ZERO4 ();
  887. BLASLONG l = 0;
  888. #if defined(TRMMKERNEL)
  889. REFRESH_POINTERS (16, 2)
  890. #else
  891. BO = B;
  892. temp = k;
  893. #endif
  894. for (l = 0; l < temp; l++)
  895. {
  896. FLOAT t[4] = { 0 };
  897. t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
  898. vec_t *rowB = (vec_t *) & t[0];
  899. vec_t *rowA = (vec_t *) & AO[l << 4];
  900. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  901. __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
  902. __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]);
  903. __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]);
  904. }
  905. SAVE2x4_ACC (&acc0, 0);
  906. SAVE2x4_ACC (&acc1, 4);
  907. SAVE2x4_ACC (&acc2, 8);
  908. SAVE2x4_ACC (&acc3, 12);
  909. CO += 16;
  910. AO += temp << 4;
  911. BO += temp << 1;
  912. #if defined(TRMMKERNEL)
  913. REFRESH_AFTER_SAVE (16, 2)
  914. #endif
  915. }
  916. i = (m & 15) >> 3;
  917. for (j = 0; j < i; j++)
  918. {
  919. FLOAT *BO;
  920. v4sf_t *rowC;
  921. v4sf_t result[4];
  922. __vector_quad acc0, acc1;
  923. __builtin_mma_xxsetaccz (&acc0);
  924. __builtin_mma_xxsetaccz (&acc1);
  925. #if defined(TRMMKERNEL)
  926. REFRESH_POINTERS (8, 2)
  927. #else
  928. BO = B;
  929. temp = k;
  930. #endif
  931. BLASLONG l = 0;
  932. for (l = 0; l < temp; l++)
  933. {
  934. FLOAT t[4] = { 0 };
  935. t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
  936. vec_t *rowB = (vec_t *) & t[0];
  937. vec_t *rowA = (vec_t *) & AO[l << 3];
  938. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  939. __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
  940. }
  941. SAVE2x4_ACC (&acc0, 0);
  942. SAVE2x4_ACC (&acc1, 4);
  943. CO += 8;
  944. AO += temp << 3;
  945. BO += temp << 1;
  946. #if defined(TRMMKERNEL)
  947. REFRESH_AFTER_SAVE (8, 2)
  948. #endif
  949. }
  950. i = (m & 7) >> 2;
  951. for (j = 0; j < i; j++)
  952. {
  953. FLOAT *BO;
  954. v4sf_t *rowC;
  955. v4sf_t result[4];
  956. __vector_quad acc0;
  957. __builtin_mma_xxsetaccz (&acc0);
  958. #if defined(TRMMKERNEL)
  959. REFRESH_POINTERS (4, 2)
  960. #else
  961. BO = B;
  962. temp = k;
  963. #endif
  964. BLASLONG l = 0;
  965. for (l = 0; l < temp; l++)
  966. {
  967. FLOAT t[4] = { 0 };
  968. t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
  969. vec_t *rowB = (vec_t *) & t[0];
  970. vec_t *rowA = (vec_t *) & AO[l << 2];
  971. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  972. }
  973. SAVE2x4_ACC (&acc0, 0);
  974. CO += 4;
  975. AO += temp << 2;
  976. BO += temp << 1;
  977. #if defined(TRMMKERNEL)
  978. REFRESH_AFTER_SAVE (4, 2)
  979. #endif
  980. }
  981. i = (m & 3) >> 1;
  982. for (j = 0; j < i; j++)
  983. {
  984. FLOAT *BO;
  985. BLASLONG l = 0;
  986. #if defined(TRMMKERNEL)
  987. REFRESH_POINTERS (2, 2)
  988. #else
  989. BO = B;
  990. temp = k;
  991. #endif
  992. v4sf_t t = { 0, 0, 0, 0 };
  993. for (l = 0; l < (temp << 1); l += 2)
  994. {
  995. v4sf_t rowA = { AO[l], AO[l], AO[l + 1], AO[l + 1] };
  996. v4sf_t rowB = { BO[l], BO[l + 1], BO[l], BO[l + 1] };
  997. t += rowA * rowB;
  998. }
  999. t = t * valpha;
  1000. #if defined(TRMMKERNEL)
  1001. CO[0 * ldc] = t[0];
  1002. CO[1 * ldc] = t[1];
  1003. CO[0 * ldc + 1] = t[2];
  1004. CO[1 * ldc + 1] = t[3];
  1005. #else
  1006. CO[0 * ldc] += t[0];
  1007. CO[1 * ldc] += t[1];
  1008. CO[0 * ldc + 1] += t[2];
  1009. CO[1 * ldc + 1] += t[3];
  1010. #endif
  1011. CO += 2;
  1012. AO += temp << 1;
  1013. BO += temp << 1;
  1014. #if defined(TRMMKERNEL)
  1015. REFRESH_AFTER_SAVE (2, 2)
  1016. #endif
  1017. }
  1018. i = (m & 1) >> 0;
  1019. for (j = 0; j < i; j++)
  1020. {
  1021. FLOAT *BO;
  1022. BLASLONG l = 0;
  1023. #if defined(TRMMKERNEL)
  1024. REFRESH_POINTERS (1, 2)
  1025. #else
  1026. BO = B;
  1027. temp = k;
  1028. #endif
  1029. v4sf_t t = { 0, 0, 0, 0 };
  1030. for (l = 0; l < temp; l++)
  1031. {
  1032. v4sf_t rowA = { AO[l], AO[l], 0, 0 };
  1033. v4sf_t rowB = { BO[l << 1], BO[(l << 1) + 1], 0, 0 };
  1034. t += rowA * rowB;
  1035. }
  1036. t = t * valpha;
  1037. #if defined(TRMMKERNEL)
  1038. CO[0 * ldc] = t[0];
  1039. CO[1 * ldc] = t[1];
  1040. #else
  1041. CO[0 * ldc] += t[0];
  1042. CO[1 * ldc] += t[1];
  1043. #endif
  1044. CO += 1;
  1045. AO += temp;
  1046. BO += temp << 1;
  1047. #if defined(TRMMKERNEL)
  1048. REFRESH_AFTER_SAVE (1, 2)
  1049. #endif
  1050. }
  1051. #if defined(TRMMKERNEL) && !defined(LEFT)
  1052. off += 2; // number of values in A
  1053. #endif
  1054. B += k << 1;
  1055. }
  1056. N = (n & 1) >> 0;
  1057. for (i1 = 0; i1 < N; i1++)
  1058. {
  1059. BLASLONG i, temp;
  1060. #if defined(TRMMKERNEL) && defined(LEFT)
  1061. off = offset;
  1062. #endif
  1063. FLOAT *CO;
  1064. FLOAT *AO;
  1065. CO = C;
  1066. C += ldc;
  1067. AO = A;
  1068. i = m;
  1069. while (i >= 16)
  1070. {
  1071. FLOAT *BO;
  1072. BLASLONG l = 0;
  1073. #if defined(TRMMKERNEL)
  1074. REFRESH_POINTERS (16, 1)
  1075. #else
  1076. BO = B;
  1077. temp = k;
  1078. #endif
  1079. v4sf_t t = { 0, 0, 0, 0 };
  1080. v4sf_t t1 = { 0, 0, 0, 0 };
  1081. v4sf_t t2 = { 0, 0, 0, 0 };
  1082. v4sf_t t3 = { 0, 0, 0, 0 };
  1083. for (l = 0; l < temp; l++)
  1084. {
  1085. v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] };
  1086. v4sf_t rowA = { AO[l << 4], AO[(l << 4) + 1], AO[(l << 4) + 2],
  1087. AO[(l << 4) + 3]
  1088. };
  1089. v4sf_t rowA1 =
  1090. { AO[(l << 4) + 4], AO[(l << 4) + 5], AO[(l << 4) + 6],
  1091. AO[(l << 4) + 7]
  1092. };
  1093. v4sf_t rowA2 =
  1094. { AO[(l << 4) + 8], AO[(l << 4) + 9], AO[(l << 4) + 10],
  1095. AO[(l << 4) + 11]
  1096. };
  1097. v4sf_t rowA3 =
  1098. { AO[(l << 4) + 12], AO[(l << 4) + 13], AO[(l << 4) + 14],
  1099. AO[(l << 4) + 15]
  1100. };
  1101. t += rowA * rowB;
  1102. t1 += rowA1 * rowB;
  1103. t2 += rowA2 * rowB;
  1104. t3 += rowA3 * rowB;
  1105. }
  1106. t = t * valpha;
  1107. t1 = t1 * valpha;
  1108. t2 = t2 * valpha;
  1109. t3 = t3 * valpha;
  1110. #if defined(TRMMKERNEL)
  1111. CO[0] = t[0];
  1112. CO[1] = t[1];
  1113. CO[2] = t[2];
  1114. CO[3] = t[3];
  1115. CO[4] = t1[0];
  1116. CO[5] = t1[1];
  1117. CO[6] = t1[2];
  1118. CO[7] = t1[3];
  1119. CO[8] = t2[0];
  1120. CO[9] = t2[1];
  1121. CO[10] = t2[2];
  1122. CO[11] = t2[3];
  1123. CO[12] = t3[0];
  1124. CO[13] = t3[1];
  1125. CO[14] = t3[2];
  1126. CO[15] = t3[3];
  1127. #else
  1128. CO[0] += t[0];
  1129. CO[1] += t[1];
  1130. CO[2] += t[2];
  1131. CO[3] += t[3];
  1132. CO[4] += t1[0];
  1133. CO[5] += t1[1];
  1134. CO[6] += t1[2];
  1135. CO[7] += t1[3];
  1136. CO[8] += t2[0];
  1137. CO[9] += t2[1];
  1138. CO[10] += t2[2];
  1139. CO[11] += t2[3];
  1140. CO[12] += t3[0];
  1141. CO[13] += t3[1];
  1142. CO[14] += t3[2];
  1143. CO[15] += t3[3];
  1144. #endif
  1145. AO += temp << 4;
  1146. BO += temp;
  1147. CO += 16;
  1148. i -= 16;
  1149. #if defined(TRMMKERNEL)
  1150. REFRESH_AFTER_SAVE (16, 1)
  1151. #endif
  1152. }
  1153. while (i >= 8)
  1154. {
  1155. FLOAT *BO;
  1156. BLASLONG l = 0;
  1157. v4sf_t t = { 0, 0, 0, 0 };
  1158. v4sf_t t1 = { 0, 0, 0, 0 };
  1159. #if defined(TRMMKERNEL)
  1160. REFRESH_POINTERS (8, 1)
  1161. #else
  1162. BO = B;
  1163. temp = k;
  1164. #endif
  1165. for (l = 0; l < temp; l++)
  1166. {
  1167. v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] };
  1168. v4sf_t rowA = { AO[l << 3], AO[(l << 3) + 1], AO[(l << 3) + 2],
  1169. AO[(l << 3) + 3]
  1170. };
  1171. v4sf_t rowA1 =
  1172. { AO[(l << 3) + 4], AO[(l << 3) + 5], AO[(l << 3) + 6],
  1173. AO[(l << 3) + 7]
  1174. };
  1175. t += rowA * rowB;
  1176. t1 += rowA1 * rowB;
  1177. }
  1178. t = t * valpha;
  1179. t1 = t1 * valpha;
  1180. #if defined(TRMMKERNEL)
  1181. CO[0] = t[0];
  1182. CO[1] = t[1];
  1183. CO[2] = t[2];
  1184. CO[3] = t[3];
  1185. CO[4] = t1[0];
  1186. CO[5] = t1[1];
  1187. CO[6] = t1[2];
  1188. CO[7] = t1[3];
  1189. #else
  1190. CO[0] += t[0];
  1191. CO[1] += t[1];
  1192. CO[2] += t[2];
  1193. CO[3] += t[3];
  1194. CO[4] += t1[0];
  1195. CO[5] += t1[1];
  1196. CO[6] += t1[2];
  1197. CO[7] += t1[3];
  1198. #endif
  1199. AO += temp << 3;
  1200. BO += temp;
  1201. CO += 8;
  1202. i -= 8;
  1203. #if defined(TRMMKERNEL)
  1204. REFRESH_AFTER_SAVE (8, 1)
  1205. #endif
  1206. }
  1207. while (i >= 4)
  1208. {
  1209. FLOAT *BO;
  1210. BLASLONG l = 0;
  1211. v4sf_t t = { 0, 0, 0, 0 };
  1212. #if defined(TRMMKERNEL)
  1213. REFRESH_POINTERS (4, 1)
  1214. #else
  1215. BO = B;
  1216. temp = k;
  1217. #endif
  1218. for (l = 0; l < temp; l++)
  1219. {
  1220. v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] };
  1221. v4sf_t rowA = { AO[l << 2], AO[(l << 2) + 1], AO[(l << 2) + 2],
  1222. AO[(l << 2) + 3]
  1223. };
  1224. t += rowA * rowB;
  1225. }
  1226. t = t * valpha;
  1227. #if defined(TRMMKERNEL)
  1228. CO[0] = t[0];
  1229. CO[1] = t[1];
  1230. CO[2] = t[2];
  1231. CO[3] = t[3];
  1232. #else
  1233. CO[0] += t[0];
  1234. CO[1] += t[1];
  1235. CO[2] += t[2];
  1236. CO[3] += t[3];
  1237. #endif
  1238. AO += temp << 2;
  1239. BO += temp;
  1240. CO += 4;
  1241. i -= 4;
  1242. #if defined(TRMMKERNEL)
  1243. REFRESH_AFTER_SAVE (4, 1)
  1244. #endif
  1245. }
  1246. while (i >= 2)
  1247. {
  1248. FLOAT *BO;
  1249. BLASLONG l = 0;
  1250. #if defined(TRMMKERNEL)
  1251. REFRESH_POINTERS (2, 1)
  1252. #else
  1253. BO = B;
  1254. temp = k;
  1255. #endif
  1256. v4sf_t t = { 0, 0, 0, 0 };
  1257. for (l = 0; l < temp; l++)
  1258. {
  1259. v4sf_t rowB = { BO[l], BO[l], 0, 0 };
  1260. v4sf_t rowA = { AO[l << 1], AO[(l << 1) + 1], 0, 0 };
  1261. t += rowA * rowB;
  1262. }
  1263. t = t * valpha;
  1264. #if defined(TRMMKERNEL)
  1265. CO[0] = t[0];
  1266. CO[1] = t[1];
  1267. #else
  1268. CO[0] += t[0];
  1269. CO[1] += t[1];
  1270. #endif
  1271. AO += temp << 1;
  1272. BO += temp;
  1273. CO += 2;
  1274. i -= 2;
  1275. #if defined(TRMMKERNEL)
  1276. REFRESH_AFTER_SAVE (2, 1)
  1277. #endif
  1278. }
  1279. while (i >= 1)
  1280. {
  1281. FLOAT *BO;
  1282. #if defined(TRMMKERNEL)
  1283. REFRESH_POINTERS (1, 1)
  1284. #else
  1285. BO = B;
  1286. temp = k;
  1287. #endif
  1288. BLASLONG l = 0;
  1289. FLOAT t = 0;
  1290. for (l = 0; l < temp; l++)
  1291. {
  1292. t += AO[l] * BO[l];
  1293. }
  1294. AO += temp;
  1295. BO += temp;
  1296. #if defined(TRMMKERNEL)
  1297. CO[0] = t * alpha;
  1298. #else
  1299. CO[0] += t * alpha;
  1300. #endif
  1301. CO += 1;
  1302. i -= 1;
  1303. #if defined(TRMMKERNEL)
  1304. REFRESH_AFTER_SAVE (1, 1)
  1305. #endif
  1306. }
  1307. #if defined(TRMMKERNEL) && !defined(LEFT)
  1308. off += 1; // number of values in A
  1309. #endif
  1310. B += k;
  1311. }
  1312. return 0;
  1313. }