You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

sgemm_kernel_power10.c 35 kB


  1. /*********************************************************************************
  2. Copyright (c) 2020, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. **********************************************************************************/
  27. #include "common.h"
  28. #include <altivec.h>
  /*
   * Vector types used throughout this kernel.
   *   vec_t  - AltiVec vector of unsigned char; the A/B panel pointers are
   *            cast to this type before being handed to the
   *            __builtin_mma_xvf32ger* built-ins.
   *   v4sf_t - GCC vector_size(16) vector of FLOAT; used for the
   *            disassembled accumulator rows and for 16-byte accesses to C.
   *   v2sf_t - GCC vector_size(8) vector of FLOAT; used for the 8-byte
   *            accesses to C in the SAVE4x2_* macros.
   * NOTE(review): FLOAT is not defined in this file (presumably it comes from
   * common.h and is a 4-byte float for this sgemm kernel, giving 4 and 2
   * lanes respectively) - confirm.
   */
  29. typedef __vector unsigned char vec_t;
  30. typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
  31. typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
  /*
   * Accumulator write-back macros.
   *
   * Every macro first disassembles the MMA accumulator ACC into the local
   * array `result`, then scales pieces of it by `alpha` and writes them to
   * the output tile at CO[r * ldc + J] for a fixed set of r values.
   *
   * The macros are not self-contained: they expect `result`, `rowC`, `CO`,
   * `ldc` and `alpha` to be in scope at the expansion site, and each
   * expansion already ends in a semicolon.
   *
   * Two variants are provided:
   *   - TRMMKERNEL defined:     C is overwritten   (rowC[0]  = result * alpha)
   *   - TRMMKERNEL not defined: C is accumulated   (rowC[0] += result * alpha)
   */
  32. #if defined(TRMMKERNEL)
  /* SAVE_ACC: store result[0..3] to CO[r*ldc + J] for r = 0..3 (v4sf_t access). */
  33. #define SAVE_ACC(ACC, J) \
  34. __builtin_mma_disassemble_acc ((void *)result, ACC); \
  35. rowC = (v4sf_t *) &CO[0* ldc+J]; \
  36. rowC[0] = result[0] * alpha; \
  37. rowC = (v4sf_t *) &CO[1*ldc+J]; \
  38. rowC[0] = result[1] * alpha; \
  39. rowC = (v4sf_t *) &CO[2*ldc+J]; \
  40. rowC[0] = result[2] * alpha; \
  41. rowC = (v4sf_t *) &CO[3*ldc+J]; \
  42. rowC[0] = result[3] * alpha;
  /* SAVE_ACC1: same as SAVE_ACC but targets r = 4..7. */
  43. #define SAVE_ACC1(ACC, J) \
  44. __builtin_mma_disassemble_acc ((void *)result, ACC); \
  45. rowC = (v4sf_t *) &CO[4* ldc+J]; \
  46. rowC[0] = result[0] * alpha; \
  47. rowC = (v4sf_t *) &CO[5*ldc+J]; \
  48. rowC[0] = result[1] * alpha; \
  49. rowC = (v4sf_t *) &CO[6*ldc+J]; \
  50. rowC[0] = result[2] * alpha; \
  51. rowC = (v4sf_t *) &CO[7*ldc+J]; \
  52. rowC[0] = result[3] * alpha;
  /*
   * SAVE4x2_ACC: v2sf_t (8-byte) access for r = 0..3.  Only the even
   * elements result[0], [2], [4], [6] are stored; the callers in this file
   * declare `result` as v2sf_t[8] for these macros.
   */
  53. #define SAVE4x2_ACC(ACC, J) \
  54. __builtin_mma_disassemble_acc ((void *)result, ACC); \
  55. rowC = (v2sf_t *) &CO[0* ldc+J]; \
  56. rowC[0] = result[0] * alpha; \
  57. rowC = (v2sf_t *) &CO[1* ldc+J]; \
  58. rowC[0] = result[2] * alpha; \
  59. rowC = (v2sf_t *) &CO[2* ldc+J]; \
  60. rowC[0] = result[4] * alpha; \
  61. rowC = (v2sf_t *) &CO[3* ldc+J]; \
  62. rowC[0] = result[6] * alpha;
  /* SAVE4x2_ACC1: same as SAVE4x2_ACC but targets r = 4..7. */
  63. #define SAVE4x2_ACC1(ACC, J) \
  64. __builtin_mma_disassemble_acc ((void *)result, ACC); \
  65. rowC = (v2sf_t *) &CO[4* ldc+J]; \
  66. rowC[0] = result[0] * alpha; \
  67. rowC = (v2sf_t *) &CO[5* ldc+J]; \
  68. rowC[0] = result[2] * alpha; \
  69. rowC = (v2sf_t *) &CO[6* ldc+J]; \
  70. rowC[0] = result[4] * alpha; \
  71. rowC = (v2sf_t *) &CO[7* ldc+J]; \
  72. rowC[0] = result[6] * alpha;
  /* SAVE2x4_ACC: v4sf_t access for r = 0..1 only; uses result[0] and result[1]. */
  73. #define SAVE2x4_ACC(ACC, J) \
  74. __builtin_mma_disassemble_acc ((void *)result, ACC); \
  75. rowC = (v4sf_t *) &CO[0* ldc+J]; \
  76. rowC[0] = result[0] * alpha; \
  77. rowC = (v4sf_t *) &CO[1* ldc+J]; \
  78. rowC[0] = result[1] * alpha;
  79. #else
  /*
   * Non-TRMM variants: identical addressing to the macros above, but the
   * scaled result is added to the existing contents of C (+=).
   */
  80. #define SAVE_ACC(ACC, J) \
  81. __builtin_mma_disassemble_acc ((void *)result, ACC); \
  82. rowC = (v4sf_t *) &CO[0* ldc+J]; \
  83. rowC[0] += result[0] * alpha; \
  84. rowC = (v4sf_t *) &CO[1*ldc+J]; \
  85. rowC[0] += result[1] * alpha; \
  86. rowC = (v4sf_t *) &CO[2*ldc+J]; \
  87. rowC[0] += result[2] * alpha; \
  88. rowC = (v4sf_t *) &CO[3*ldc+J]; \
  89. rowC[0] += result[3] * alpha;
  90. #define SAVE_ACC1(ACC, J) \
  91. __builtin_mma_disassemble_acc ((void *)result, ACC); \
  92. rowC = (v4sf_t *) &CO[4* ldc+J]; \
  93. rowC[0] += result[0] * alpha; \
  94. rowC = (v4sf_t *) &CO[5*ldc+J]; \
  95. rowC[0] += result[1] * alpha; \
  96. rowC = (v4sf_t *) &CO[6*ldc+J]; \
  97. rowC[0] += result[2] * alpha; \
  98. rowC = (v4sf_t *) &CO[7*ldc+J]; \
  99. rowC[0] += result[3] * alpha;
  100. #define SAVE4x2_ACC(ACC, J) \
  101. __builtin_mma_disassemble_acc ((void *)result, ACC); \
  102. rowC = (v2sf_t *) &CO[0* ldc+J]; \
  103. rowC[0] += result[0] * alpha; \
  104. rowC = (v2sf_t *) &CO[1* ldc+J]; \
  105. rowC[0] += result[2] * alpha; \
  106. rowC = (v2sf_t *) &CO[2* ldc+J]; \
  107. rowC[0] += result[4] * alpha; \
  108. rowC = (v2sf_t *) &CO[3* ldc+J]; \
  109. rowC[0] += result[6] * alpha;
  110. #define SAVE4x2_ACC1(ACC, J) \
  111. __builtin_mma_disassemble_acc ((void *)result, ACC); \
  112. rowC = (v2sf_t *) &CO[4* ldc+J]; \
  113. rowC[0] += result[0] * alpha; \
  114. rowC = (v2sf_t *) &CO[5* ldc+J]; \
  115. rowC[0] += result[2] * alpha; \
  116. rowC = (v2sf_t *) &CO[6* ldc+J]; \
  117. rowC[0] += result[4] * alpha; \
  118. rowC = (v2sf_t *) &CO[7* ldc+J]; \
  119. rowC[0] += result[6] * alpha;
  120. #define SAVE2x4_ACC(ACC, J) \
  121. __builtin_mma_disassemble_acc ((void *)result, ACC); \
  122. rowC = (v4sf_t *) &CO[0* ldc+J]; \
  123. rowC[0] += result[0] * alpha; \
  124. rowC = (v4sf_t *) &CO[1* ldc+J]; \
  125. rowC[0] += result[1] * alpha;
  126. #endif
  /*
   * KERNEL(i, j): one step of the unrolled inner loop that drives all eight
   * accumulators acc0..acc7.
   *
   * Operand pairing:
   *   - even accumulators (acc0, acc2, acc4, acc6) use rowB[i]
   *   - odd  accumulators (acc1, acc3, acc5, acc7) use rowB[i+1]
   *   - accumulator pair (acc(2p), acc(2p+1)) uses rowA[j+p], p = 0..3
   * So each expansion reads two vec_t from rowB and four vec_t from rowA.
   *
   * It uses the "pp" form of the built-in, which adds into the accumulator;
   * the callers prime acc0..acc7 with the plain __builtin_mma_xvf32ger
   * before the first KERNEL step.
   *
   * Expects `rowA`, `rowB` and `acc0`..`acc7` to be in scope at the
   * expansion site; the expansion already ends in a semicolon.
   */
  127. #define KERNEL(i, j) \
  128. __builtin_mma_xvf32gerpp (&acc0, rowB[i], rowA[j]); \
  129. __builtin_mma_xvf32gerpp (&acc1, rowB[i+1], rowA[j]); \
  130. __builtin_mma_xvf32gerpp (&acc2, rowB[i], rowA[j+1]); \
  131. __builtin_mma_xvf32gerpp (&acc3, rowB[i+1], rowA[j+1]); \
  132. __builtin_mma_xvf32gerpp (&acc4, rowB[i], rowA[j+2]); \
  133. __builtin_mma_xvf32gerpp (&acc5, rowB[i+1], rowA[j+2]); \
  134. __builtin_mma_xvf32gerpp (&acc6, rowB[i], rowA[j+3]); \
  135. __builtin_mma_xvf32gerpp (&acc7, rowB[i+1], rowA[j+3]);
  /*
   * PREFETCH1(x, y): emits a single `dcbt` instruction through inline asm,
   * passing x and y as its two register operands (y uses the "b" register
   * constraint).  The statement is volatile and declares a "memory" clobber.
   * The expansion already ends in a semicolon.
   */
  136. #define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");
  /*
   * TRMM bookkeeping macros.
   *
   * These macros maintain the caller's `temp` (number of k-steps to run for
   * the current tile), `off` (running triangular offset) and the panel
   * pointers `AO` / `BO`.  `x` is the tile extent along A and `y` the tile
   * extent along B, as passed by the call sites (e.g. (16, 8), (4, 2)).
   *
   * They are not self-contained: `temp`, `off`, `k`, `AO`, `BO` and `B` must
   * be in scope where they are expanded.  Every non-empty expansion ends in
   * a semicolon on purpose, because several call sites invoke the macros
   * without a trailing ';'.
   *
   * Every use of a macro parameter is parenthesized so that an expression
   * argument (for example `n + 1`) is evaluated as a unit instead of being
   * split by operator precedence.
   */

  /* REFRESH_TEMP_BK: (re)compute `temp` for the tile about to be processed. */
  #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  #define REFRESH_TEMP_BK(x, y) \
    temp = k - off;
  #elif defined(LEFT)
  #define REFRESH_TEMP_BK(x, y) \
    temp = off + (x);
  #else
  #define REFRESH_TEMP_BK(x, y) \
    temp = off + (y);
  #endif

  /*
   * REFRESH_POINTERS: position BO (and, in the second variant, AO) at the
   * start of the data for this tile, then refresh `temp`.
   */
  #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  #define REFRESH_POINTERS(x, y) \
    BO = B; \
    REFRESH_TEMP_BK(x, y)
  #else
  #define REFRESH_POINTERS(x, y) \
    AO += off * (x); \
    BO = B + off * (y); \
    REFRESH_TEMP_BK(x, y)
  #endif

  /* REFRESH_OFF: advance `off` by x when LEFT is defined; otherwise a no-op. */
  #ifdef LEFT
  #define REFRESH_OFF(x) \
    off += (x);
  #else
  #define REFRESH_OFF(x)
  #endif

  /* UPDATE_TEMP: subtract the tile extent (x for LEFT, y otherwise) from `temp`. */
  #ifdef LEFT
  #define UPDATE_TEMP(x, y) \
    temp -= (x);
  #else
  #define UPDATE_TEMP(x, y) \
    temp -= (y);
  #endif

  /*
   * REFRESH_TMP_AFTER_SAVE: in the first variant, recompute `temp` as
   * (k - off) minus the tile extent and advance AO / BO by temp * x and
   * temp * y respectively.  In the other variant it expands to nothing.
   */
  #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  #define REFRESH_TMP_AFTER_SAVE(x, y) \
    temp = k - off; \
    UPDATE_TEMP(x, y) \
    AO += temp * (x); \
    BO += temp * (y);
  #else
  #define REFRESH_TMP_AFTER_SAVE(x, y)
  #endif

  /* REFRESH_AFTER_SAVE: post-store bookkeeping = pointer fix-up + offset update. */
  #define REFRESH_AFTER_SAVE(x, y) \
    REFRESH_TMP_AFTER_SAVE(x, y) \
    REFRESH_OFF(x)
  182. /*************************************************************************************
  183. * GEMM Kernel
  184. *************************************************************************************/
  185. int
  186. CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
  187. FLOAT * C, BLASLONG ldc
  188. #ifdef TRMMKERNEL
  189. , BLASLONG offset
  190. #endif
  191. )
  192. {
  193. BLASLONG i1;
  194. #if defined(TRMMKERNEL)
  195. BLASLONG off;
  196. #endif
  197. #if defined(TRMMKERNEL) && !defined(LEFT)
  198. off = -offset;
  199. #endif
  200. v4sf_t valpha = { alpha, alpha, alpha, alpha };
  201. for (i1 = 0; i1 < (n >> 3); i1++)
  202. {
  203. BLASLONG j, temp;
  204. FLOAT *CO;
  205. FLOAT *AO;
  206. #if defined(TRMMKERNEL) && defined(LEFT)
  207. off = offset;
  208. #endif
  209. CO = C;
  210. C += ldc << 3;
  211. AO = A;
  212. PREFETCH1 (A, 128);
  213. PREFETCH1 (A, 256);
  214. for (j = 0; j < (m >> 4); j++)
  215. {
  216. FLOAT *BO;
  217. #if defined(TRMMKERNEL)
  218. REFRESH_POINTERS (16, 8);
  219. #else
  220. BO = B;
  221. temp = k;
  222. #endif
  223. v4sf_t *rowC;
  224. v4sf_t result[4];
  225. __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
  226. BLASLONG l = 0;
  227. vec_t *rowA1 = (vec_t *) & AO[0];
  228. vec_t *rowB1 = (vec_t *) & BO[0];
  229. __builtin_mma_xvf32ger (&acc0, rowB1[0], rowA1[0]);
  230. __builtin_mma_xvf32ger (&acc1, rowB1[1], rowA1[0]);
  231. __builtin_mma_xvf32ger (&acc2, rowB1[0], rowA1[1]);
  232. __builtin_mma_xvf32ger (&acc3, rowB1[1], rowA1[1]);
  233. __builtin_mma_xvf32ger (&acc4, rowB1[0], rowA1[2]);
  234. __builtin_mma_xvf32ger (&acc5, rowB1[1], rowA1[2]);
  235. __builtin_mma_xvf32ger (&acc6, rowB1[0], rowA1[3]);
  236. __builtin_mma_xvf32ger (&acc7, rowB1[1], rowA1[3]);
  237. AO += 16;
  238. BO += 8;
  239. temp--;
  240. BLASLONG K = temp / 64;
  241. for (l = 0; l < K; l++)
  242. {
  243. vec_t *rowA = (vec_t *) & AO[0];
  244. vec_t *rowB = (vec_t *) & BO[0];
  245. KERNEL (0, 0);
  246. KERNEL (2, 4);
  247. KERNEL (4, 8);
  248. KERNEL (6, 12);
  249. KERNEL (8, 16);
  250. KERNEL (10, 20);
  251. KERNEL (12, 24);
  252. KERNEL (14, 28);
  253. KERNEL (16, 32);
  254. KERNEL (18, 36);
  255. KERNEL (20, 40);
  256. KERNEL (22, 44);
  257. KERNEL (24, 48);
  258. KERNEL (26, 52);
  259. KERNEL (28, 56);
  260. KERNEL (30, 60);
  261. KERNEL (32, 64);
  262. KERNEL (34, 68);
  263. KERNEL (36, 72);
  264. KERNEL (38, 76);
  265. KERNEL (40, 80);
  266. KERNEL (42, 84);
  267. KERNEL (44, 88);
  268. KERNEL (46, 92);
  269. KERNEL (48, 96);
  270. KERNEL (50, 100);
  271. KERNEL (52, 104);
  272. KERNEL (54, 108);
  273. KERNEL (56, 112);
  274. KERNEL (58, 116);
  275. KERNEL (60, 120);
  276. KERNEL (62, 124);
  277. KERNEL (64, 128);
  278. KERNEL (66, 132);
  279. KERNEL (68, 136);
  280. KERNEL (70, 140);
  281. KERNEL (72, 144);
  282. KERNEL (74, 148);
  283. KERNEL (76, 152);
  284. KERNEL (78, 156);
  285. KERNEL (80, 160);
  286. KERNEL (82, 164);
  287. KERNEL (84, 168);
  288. KERNEL (86, 172);
  289. KERNEL (88, 176);
  290. KERNEL (90, 180);
  291. KERNEL (92, 184);
  292. KERNEL (94, 188);
  293. KERNEL (96, 192);
  294. KERNEL (98, 196);
  295. KERNEL (100, 200);
  296. KERNEL (102, 204);
  297. KERNEL (104, 208);
  298. KERNEL (106, 212);
  299. KERNEL (108, 216);
  300. KERNEL (110, 220);
  301. KERNEL (112, 224);
  302. KERNEL (114, 228);
  303. KERNEL (116, 232);
  304. KERNEL (118, 236);
  305. KERNEL (120, 240);
  306. KERNEL (122, 244);
  307. KERNEL (124, 248);
  308. KERNEL (126, 252);
  309. AO += 1024;
  310. BO += 512;
  311. }
  312. if ((temp & 63) >> 5)
  313. {
  314. vec_t *rowA = (vec_t *) & AO[0];
  315. vec_t *rowB = (vec_t *) & BO[0];
  316. KERNEL (0, 0);
  317. KERNEL (2, 4);
  318. KERNEL (4, 8);
  319. KERNEL (6, 12);
  320. KERNEL (8, 16);
  321. KERNEL (10, 20);
  322. KERNEL (12, 24);
  323. KERNEL (14, 28);
  324. KERNEL (16, 32);
  325. KERNEL (18, 36);
  326. KERNEL (20, 40);
  327. KERNEL (22, 44);
  328. KERNEL (24, 48);
  329. KERNEL (26, 52);
  330. KERNEL (28, 56);
  331. KERNEL (30, 60);
  332. KERNEL (32, 64);
  333. KERNEL (34, 68);
  334. KERNEL (36, 72);
  335. KERNEL (38, 76);
  336. KERNEL (40, 80);
  337. KERNEL (42, 84);
  338. KERNEL (44, 88);
  339. KERNEL (46, 92);
  340. KERNEL (48, 96);
  341. KERNEL (50, 100);
  342. KERNEL (52, 104);
  343. KERNEL (54, 108);
  344. KERNEL (56, 112);
  345. KERNEL (58, 116);
  346. KERNEL (60, 120);
  347. KERNEL (62, 124);
  348. AO += 512;
  349. BO += 256;
  350. }
  351. if ((temp & 31) >> 4)
  352. {
  353. vec_t *rowA = (vec_t *) & AO[0];
  354. vec_t *rowB = (vec_t *) & BO[0];
  355. KERNEL (0, 0);
  356. KERNEL (2, 4);
  357. KERNEL (4, 8);
  358. KERNEL (6, 12);
  359. KERNEL (8, 16);
  360. KERNEL (10, 20);
  361. KERNEL (12, 24);
  362. KERNEL (14, 28);
  363. KERNEL (16, 32);
  364. KERNEL (18, 36);
  365. KERNEL (20, 40);
  366. KERNEL (22, 44);
  367. KERNEL (24, 48);
  368. KERNEL (26, 52);
  369. KERNEL (28, 56);
  370. KERNEL (30, 60);
  371. AO += 256;
  372. BO += 128;
  373. }
  374. if ((temp & 15) >> 3)
  375. {
  376. vec_t *rowA = (vec_t *) & AO[0];
  377. vec_t *rowB = (vec_t *) & BO[0];
  378. KERNEL (0, 0);
  379. KERNEL (2, 4);
  380. KERNEL (4, 8);
  381. KERNEL (6, 12);
  382. KERNEL (8, 16);
  383. KERNEL (10, 20);
  384. KERNEL (12, 24);
  385. KERNEL (14, 28);
  386. AO += 128;
  387. BO += 64;
  388. }
  389. if ((temp & 7) >> 2)
  390. {
  391. vec_t *rowA = (vec_t *) & AO[0];
  392. vec_t *rowB = (vec_t *) & BO[0];
  393. KERNEL (0, 0);
  394. KERNEL (2, 4);
  395. KERNEL (4, 8);
  396. KERNEL (6, 12);
  397. AO += 64;
  398. BO += 32;
  399. }
  400. if ((temp & 3) >> 1)
  401. {
  402. vec_t *rowA = (vec_t *) & AO[0];
  403. vec_t *rowB = (vec_t *) & BO[0];
  404. KERNEL (0, 0);
  405. KERNEL (2, 4);
  406. AO += 32;
  407. BO += 16;
  408. }
  409. if ((temp & 1) >> 0)
  410. {
  411. vec_t *rowA = (vec_t *) & AO[0];
  412. vec_t *rowB = (vec_t *) & BO[0];
  413. KERNEL (0, 0);
  414. AO += 16;
  415. BO += 8;
  416. }
  417. SAVE_ACC (&acc0, 0);
  418. SAVE_ACC (&acc2, 4);
  419. SAVE_ACC1 (&acc1, 0);
  420. SAVE_ACC1 (&acc3, 4);
  421. SAVE_ACC (&acc4, 8);
  422. SAVE_ACC (&acc6, 12);
  423. SAVE_ACC1 (&acc5, 8);
  424. SAVE_ACC1 (&acc7, 12);
  425. #if defined(TRMMKERNEL)
  426. REFRESH_AFTER_SAVE (16, 8)
  427. #endif
  428. CO += 16;
  429. }
  430. if (m & 8)
  431. {
  432. FLOAT *BO;
  433. #if defined(TRMMKERNEL)
  434. REFRESH_POINTERS (8, 8);
  435. #else
  436. BO = B;
  437. temp = k;
  438. #endif
  439. v4sf_t *rowC;
  440. v4sf_t result[4];
  441. __vector_quad acc0, acc1, acc2, acc3;
  442. BLASLONG l = 0;
  443. vec_t *rowA = (vec_t *) & AO[0];
  444. vec_t *rowB = (vec_t *) & BO[0];
  445. __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
  446. __builtin_mma_xvf32ger (&acc1, rowB[1], rowA[0]);
  447. __builtin_mma_xvf32ger (&acc2, rowB[0], rowA[1]);
  448. __builtin_mma_xvf32ger (&acc3, rowB[1], rowA[1]);
  449. for (l = 1; l < temp; l++)
  450. {
  451. rowA = (vec_t *) & AO[l << 3];
  452. rowB = (vec_t *) & BO[l << 3];
  453. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  454. __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]);
  455. __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[1]);
  456. __builtin_mma_xvf32gerpp (&acc3, rowB[1], rowA[1]);
  457. }
  458. SAVE_ACC (&acc0, 0);
  459. SAVE_ACC (&acc2, 4);
  460. SAVE_ACC1 (&acc1, 0);
  461. SAVE_ACC1 (&acc3, 4);
  462. AO += (temp << 3);
  463. BO += (temp << 3);
  464. CO += 8;
  465. #if defined(TRMMKERNEL)
  466. REFRESH_AFTER_SAVE (8, 8)
  467. #endif
  468. }
  469. if (m & 4)
  470. {
  471. FLOAT *BO;
  472. #if defined(TRMMKERNEL)
  473. REFRESH_POINTERS (4, 8);
  474. #else
  475. BO = B;
  476. temp = k;
  477. #endif
  478. v4sf_t *rowC;
  479. v4sf_t result[4];
  480. __vector_quad acc0, acc1;
  481. BLASLONG l = 0;
  482. vec_t *rowA = (vec_t *) & AO[0];
  483. vec_t *rowB = (vec_t *) & BO[0];
  484. __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
  485. __builtin_mma_xvf32ger (&acc1, rowB[1], rowA[0]);
  486. for (l = 1; l < temp; l++)
  487. {
  488. rowA = (vec_t *) & AO[l << 2];
  489. rowB = (vec_t *) & BO[l << 3];
  490. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  491. __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]);
  492. }
  493. SAVE_ACC (&acc0, 0);
  494. SAVE_ACC1 (&acc1, 0);
  495. CO += 4;
  496. AO += (temp << 2);
  497. BO += (temp << 3);
  498. #if defined(TRMMKERNEL)
  499. REFRESH_AFTER_SAVE (4, 8)
  500. #endif
  501. }
  502. if (m & 2)
  503. {
  504. FLOAT *BO;
  505. #if defined(TRMMKERNEL)
  506. REFRESH_POINTERS (2, 8);
  507. #else
  508. BO = B;
  509. temp = k;
  510. #endif
  511. v2sf_t *rowC;
  512. v2sf_t result[8];
  513. __vector_quad acc0, acc1;
  514. BLASLONG l = 0;
  515. FLOAT t[4] = { 0 };
  516. t[0] = AO[0], t[1] = AO[1];
  517. vec_t *rowA = (vec_t *) & t[0];
  518. vec_t *rowB = (vec_t *) & BO[0];
  519. __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
  520. __builtin_mma_xvf32ger (&acc1, rowB[1], rowA[0]);
  521. for (l = 1; l < temp; l++)
  522. {
  523. t[0] = AO[l << 1], t[1] = AO[(l << 1) + 1];
  524. rowA = (vec_t *) & t[0];
  525. rowB = (vec_t *) & BO[l << 3];
  526. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  527. __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]);
  528. }
  529. SAVE4x2_ACC (&acc0, 0);
  530. SAVE4x2_ACC1 (&acc1, 0);
  531. CO += 2;
  532. AO += (temp << 1);
  533. BO += (temp << 3);
  534. #if defined(TRMMKERNEL)
  535. REFRESH_AFTER_SAVE (2, 8)
  536. #endif
  537. }
  538. if (m & 1)
  539. {
  540. FLOAT *BO;
  541. #if defined(TRMMKERNEL)
  542. REFRESH_POINTERS (1, 8);
  543. #else
  544. BO = B;
  545. temp = k;
  546. #endif
  547. BLASLONG l = 0;
  548. v4sf_t t = { 0, 0, 0, 0 };
  549. v4sf_t t1 = { 0, 0, 0, 0 };
  550. for (l = 0; l < temp; l++)
  551. {
  552. v4sf_t rowA = { AO[l], AO[l], AO[l], AO[l] };
  553. v4sf_t rowB = { BO[l << 3], BO[(l << 3) + 1], BO[(l << 3) + 2],
  554. BO[(l << 3) + 3]
  555. };
  556. v4sf_t rowB1 =
  557. { BO[(l << 3) + 4], BO[(l << 3) + 5], BO[(l << 3) + 6],
  558. BO[(l << 3) + 7]
  559. };
  560. t += rowA * rowB;
  561. t1 += rowA * rowB1;
  562. }
  563. t = t * valpha;
  564. t1 = t1 * valpha;
  565. #if defined(TRMMKERNEL)
  566. CO[0 * ldc] = t[0];
  567. CO[1 * ldc] = t[1];
  568. CO[2 * ldc] = t[2];
  569. CO[3 * ldc] = t[3];
  570. CO[4 * ldc] = t1[0];
  571. CO[5 * ldc] = t1[1];
  572. CO[6 * ldc] = t1[2];
  573. CO[7 * ldc] = t1[3];
  574. #else
  575. CO[0 * ldc] += t[0];
  576. CO[1 * ldc] += t[1];
  577. CO[2 * ldc] += t[2];
  578. CO[3 * ldc] += t[3];
  579. CO[4 * ldc] += t1[0];
  580. CO[5 * ldc] += t1[1];
  581. CO[6 * ldc] += t1[2];
  582. CO[7 * ldc] += t1[3];
  583. #endif
  584. CO += 1;
  585. AO += temp;
  586. BO += (temp << 3);
  587. #if defined(TRMMKERNEL)
  588. REFRESH_AFTER_SAVE (1, 8)
  589. #endif
  590. }
  591. #if defined(TRMMKERNEL) && !defined(LEFT)
  592. off += 8; // number of values in A
  593. #endif
  594. B += k << 3;
  595. }
  596. if (n & 4)
  597. {
  598. BLASLONG i, j, temp;
  599. #if defined(TRMMKERNEL) && defined(LEFT)
  600. off = offset;
  601. #endif
  602. FLOAT *CO;
  603. FLOAT *AO;
  604. CO = C;
  605. C += ldc << 2;
  606. AO = A;
  607. #if !defined(TRMMKERNEL)
  608. i = m >> 5;
  609. for (j = 0; j < i; j++)
  610. {
  611. FLOAT *BO = B;
  612. v4sf_t *rowC;
  613. v4sf_t result[4];
  614. FLOAT *A1;
  615. A1 = AO + (16 * k);
  616. __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
  617. BLASLONG l = 0;
  618. vec_t *rowA = (vec_t *) & AO[0];
  619. vec_t *rowA1 = (vec_t *) & A1[0];
  620. vec_t *rowB = (vec_t *) & BO[0];
  621. __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
  622. __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]);
  623. __builtin_mma_xvf32ger (&acc2, rowB[0], rowA[2]);
  624. __builtin_mma_xvf32ger (&acc3, rowB[0], rowA[3]);
  625. __builtin_mma_xvf32ger (&acc4, rowB[0], rowA1[0]);
  626. __builtin_mma_xvf32ger (&acc5, rowB[0], rowA1[1]);
  627. __builtin_mma_xvf32ger (&acc6, rowB[0], rowA1[2]);
  628. __builtin_mma_xvf32ger (&acc7, rowB[0], rowA1[3]);
  629. for (l = 1; l < k; l++)
  630. {
  631. rowA = (vec_t *) & AO[l << 4];
  632. rowA1 = (vec_t *) & A1[l << 4];
  633. rowB = (vec_t *) & BO[l << 2];
  634. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  635. __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
  636. __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]);
  637. __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]);
  638. __builtin_mma_xvf32gerpp (&acc4, rowB[0], rowA1[0]);
  639. __builtin_mma_xvf32gerpp (&acc5, rowB[0], rowA1[1]);
  640. __builtin_mma_xvf32gerpp (&acc6, rowB[0], rowA1[2]);
  641. __builtin_mma_xvf32gerpp (&acc7, rowB[0], rowA1[3]);
  642. }
  643. SAVE_ACC (&acc0, 0);
  644. SAVE_ACC (&acc1, 4);
  645. CO += 8;
  646. SAVE_ACC (&acc2, 0);
  647. SAVE_ACC (&acc3, 4);
  648. CO += 8;
  649. SAVE_ACC (&acc4, 0);
  650. SAVE_ACC (&acc5, 4);
  651. CO += 8;
  652. SAVE_ACC (&acc6, 0);
  653. SAVE_ACC (&acc7, 4);
  654. CO += 8;
  655. AO += k << 5;
  656. BO += k << 2;
  657. }
  658. i = (m & 31) >> 4;
  659. #else
  660. i = m >> 4;
  661. #endif
  662. for (j = 0; j < i; j++)
  663. {
  664. FLOAT *BO;
  665. #if defined(TRMMKERNEL)
  666. REFRESH_POINTERS (16, 4);
  667. #else
  668. BO = B;
  669. temp = k;
  670. #endif
  671. v4sf_t *rowC;
  672. v4sf_t result[4];
  673. __vector_quad acc0, acc1, acc2, acc3;
  674. BLASLONG l = 0;
  675. vec_t *rowA = (vec_t *) & AO[0];
  676. vec_t *rowB = (vec_t *) & BO[0];
  677. __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
  678. __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]);
  679. __builtin_mma_xvf32ger (&acc2, rowB[0], rowA[2]);
  680. __builtin_mma_xvf32ger (&acc3, rowB[0], rowA[3]);
  681. for (l = 1; l < temp; l++)
  682. {
  683. rowA = (vec_t *) & AO[l << 4];
  684. rowB = (vec_t *) & BO[l << 2];
  685. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  686. __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
  687. __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]);
  688. __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]);
  689. }
  690. SAVE_ACC (&acc0, 0);
  691. SAVE_ACC (&acc1, 4);
  692. CO += 8;
  693. SAVE_ACC (&acc2, 0);
  694. SAVE_ACC (&acc3, 4);
  695. CO += 8;
  696. AO += temp << 4;
  697. BO += temp << 2;
  698. #if defined(TRMMKERNEL)
  699. REFRESH_AFTER_SAVE (16, 4)
  700. #endif
  701. }
  702. if (m & 8)
  703. {
  704. FLOAT *BO;
  705. #if defined(TRMMKERNEL)
  706. REFRESH_POINTERS (8, 4);
  707. #else
  708. BO = B;
  709. temp = k;
  710. #endif
  711. v4sf_t *rowC;
  712. v4sf_t result[4];
  713. __vector_quad acc0, acc1;
  714. BLASLONG l = 0;
  715. vec_t *rowA = (vec_t *) & AO[0];
  716. vec_t *rowB = (vec_t *) & BO[0];
  717. __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
  718. __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]);
  719. for (l = 1; l < temp; l++)
  720. {
  721. rowA = (vec_t *) & AO[l << 3];
  722. rowB = (vec_t *) & BO[l << 2];
  723. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  724. __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
  725. }
  726. SAVE_ACC (&acc0, 0);
  727. SAVE_ACC (&acc1, 4);
  728. CO += 8;
  729. AO += temp << 3;
  730. BO += temp << 2;
  731. #if defined(TRMMKERNEL)
  732. REFRESH_AFTER_SAVE (8, 4)
  733. #endif
  734. }
  735. if (m & 4)
  736. {
  737. FLOAT *BO;
  738. #if defined(TRMMKERNEL)
  739. REFRESH_POINTERS (4, 4);
  740. #else
  741. BO = B;
  742. temp = k;
  743. #endif
  744. v4sf_t *rowC;
  745. __vector_quad acc0;
  746. v4sf_t result[4];
  747. BLASLONG l = 0;
  748. vec_t *rowA = (vec_t *) & AO[0];
  749. vec_t *rowB = (vec_t *) & BO[0];
  750. __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
  751. for (l = 1; l < temp; l++)
  752. {
  753. rowA = (vec_t *) & AO[l << 2];
  754. rowB = (vec_t *) & BO[l << 2];
  755. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  756. }
  757. SAVE_ACC (&acc0, 0);
  758. CO += 4;
  759. AO += temp << 2;
  760. BO += temp << 2;
  761. #if defined(TRMMKERNEL)
  762. REFRESH_AFTER_SAVE (4, 4)
  763. #endif
  764. }
  765. if (m & 2)
  766. {
  767. FLOAT *BO;
  768. #if defined(TRMMKERNEL)
  769. REFRESH_POINTERS (2, 4);
  770. #else
  771. BO = B;
  772. temp = k;
  773. #endif
  774. v2sf_t *rowC;
  775. v2sf_t result[8];
  776. __vector_quad acc0;
  777. BLASLONG l = 0;
  778. FLOAT t[4] = { 0 };
  779. t[0] = AO[0], t[1] = AO[1];
  780. vec_t *rowA = (vec_t *) & t[0];
  781. vec_t *rowB = (vec_t *) & BO[0];
  782. __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
  783. for (l = 1; l < temp; l++)
  784. {
  785. t[0] = AO[l << 1], t[1] = AO[(l << 1) + 1];
  786. rowA = (vec_t *) & t[0];
  787. rowB = (vec_t *) & BO[l << 2];
  788. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  789. }
  790. SAVE4x2_ACC (&acc0, 0);
  791. CO += 2;
  792. AO += temp << 1;
  793. BO += temp << 2;
  794. #if defined(TRMMKERNEL)
  795. REFRESH_AFTER_SAVE (2, 4)
  796. #endif
  797. }
  798. if (m & 1)
  799. {
  800. FLOAT *BO;
  801. #if defined(TRMMKERNEL)
  802. REFRESH_POINTERS (1, 4)
  803. #else
  804. BO = B;
  805. temp = k;
  806. #endif
  807. BLASLONG l = 0;
  808. v4sf_t t = { 0, 0, 0, 0 };
  809. for (l = 0; l < temp; l++)
  810. {
  811. v4sf_t rowA = { AO[l], AO[l], AO[l], AO[l] };
  812. v4sf_t rowB = { BO[l << 2], BO[(l << 2) + 1], BO[(l << 2) + 2],
  813. BO[(l << 2) + 3]
  814. };
  815. t += rowA * rowB;
  816. }
  817. t = t * valpha;
  818. #if defined(TRMMKERNEL)
  819. CO[0 * ldc] = t[0];
  820. CO[1 * ldc] = t[1];
  821. CO[2 * ldc] = t[2];
  822. CO[3 * ldc] = t[3];
  823. #else
  824. CO[0 * ldc] += t[0];
  825. CO[1 * ldc] += t[1];
  826. CO[2 * ldc] += t[2];
  827. CO[3 * ldc] += t[3];
  828. #endif
  829. CO += 1;
  830. AO += temp;
  831. BO += temp << 2;
  832. #if defined(TRMMKERNEL)
  833. REFRESH_AFTER_SAVE (1, 4)
  834. #endif
  835. }
  836. #if defined(TRMMKERNEL) && !defined(LEFT)
  837. off += 4; // number of values in A
  838. #endif
  839. B += k << 2;
  840. }
  841. if (n & 2)
  842. {
  843. BLASLONG i, j, temp;
  844. #if defined(TRMMKERNEL) && defined(LEFT)
  845. off = offset;
  846. #endif
  847. FLOAT *CO;
  848. FLOAT *AO;
  849. CO = C;
  850. C += ldc << 1;
  851. AO = A;
  852. #if !defined(TRMMKERNEL)
  853. i = m >> 5;
  854. for (j = 0; j < i; j++)
  855. {
  856. FLOAT *BO = B;
  857. v4sf_t *rowC;
  858. v4sf_t result[4];
  859. FLOAT *A1;
  860. A1 = AO + (16 * k);
  861. __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
  862. BLASLONG l = 0;
  863. FLOAT t[4] = { 0 };
  864. t[0] = BO[0], t[1] = BO[1];
  865. vec_t *rowB = (vec_t *) & t[0];
  866. vec_t *rowA = (vec_t *) & AO[0];
  867. vec_t *rowA1 = (vec_t *) & A1[0];
  868. __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
  869. __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]);
  870. __builtin_mma_xvf32ger (&acc2, rowB[0], rowA[2]);
  871. __builtin_mma_xvf32ger (&acc3, rowB[0], rowA[3]);
  872. __builtin_mma_xvf32ger (&acc4, rowB[0], rowA1[0]);
  873. __builtin_mma_xvf32ger (&acc5, rowB[0], rowA1[1]);
  874. __builtin_mma_xvf32ger (&acc6, rowB[0], rowA1[2]);
  875. __builtin_mma_xvf32ger (&acc7, rowB[0], rowA1[3]);
  876. for (l = 1; l < k; l++)
  877. {
  878. t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
  879. rowB = (vec_t *) & t[0];
  880. rowA = (vec_t *) & AO[l << 4];
  881. rowA1 = (vec_t *) & A1[l << 4];
  882. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  883. __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
  884. __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]);
  885. __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]);
  886. __builtin_mma_xvf32gerpp (&acc4, rowB[0], rowA1[0]);
  887. __builtin_mma_xvf32gerpp (&acc5, rowB[0], rowA1[1]);
  888. __builtin_mma_xvf32gerpp (&acc6, rowB[0], rowA1[2]);
  889. __builtin_mma_xvf32gerpp (&acc7, rowB[0], rowA1[3]);
  890. }
  891. SAVE2x4_ACC (&acc0, 0);
  892. SAVE2x4_ACC (&acc1, 4);
  893. SAVE2x4_ACC (&acc2, 8);
  894. SAVE2x4_ACC (&acc3, 12);
  895. CO += 16;
  896. SAVE2x4_ACC (&acc4, 0);
  897. SAVE2x4_ACC (&acc5, 4);
  898. SAVE2x4_ACC (&acc6, 8);
  899. SAVE2x4_ACC (&acc7, 12);
  900. CO += 16;
  901. AO += k << 5;
  902. BO += k << 1;
  903. }
  904. i = (m & 31) >> 4;
  905. #else
  906. i = m >> 4;
  907. #endif
  908. for (j = 0; j < i; j++)
  909. {
  910. FLOAT *BO;
  911. v4sf_t *rowC;
  912. v4sf_t result[4];
  913. __vector_quad acc0, acc1, acc2, acc3;
  914. BLASLONG l = 0;
  915. #if defined(TRMMKERNEL)
  916. REFRESH_POINTERS (16, 2)
  917. #else
  918. BO = B;
  919. temp = k;
  920. #endif
  921. FLOAT t[4] = { 0 };
  922. t[0] = BO[0], t[1] = BO[1];
  923. vec_t *rowB = (vec_t *) & t[0];
  924. vec_t *rowA = (vec_t *) & AO[0];
  925. __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
  926. __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]);
  927. __builtin_mma_xvf32ger (&acc2, rowB[0], rowA[2]);
  928. __builtin_mma_xvf32ger (&acc3, rowB[0], rowA[3]);
  929. for (l = 1; l < temp; l++)
  930. {
  931. t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
  932. rowB = (vec_t *) & t[0];
  933. rowA = (vec_t *) & AO[l << 4];
  934. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  935. __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
  936. __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]);
  937. __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]);
  938. }
  939. SAVE2x4_ACC (&acc0, 0);
  940. SAVE2x4_ACC (&acc1, 4);
  941. SAVE2x4_ACC (&acc2, 8);
  942. SAVE2x4_ACC (&acc3, 12);
  943. CO += 16;
  944. AO += temp << 4;
  945. BO += temp << 1;
  946. #if defined(TRMMKERNEL)
  947. REFRESH_AFTER_SAVE (16, 2)
  948. #endif
  949. }
  950. if (m & 8)
  951. {
  952. FLOAT *BO;
  953. v4sf_t *rowC;
  954. v4sf_t result[4];
  955. __vector_quad acc0, acc1;
  956. #if defined(TRMMKERNEL)
  957. REFRESH_POINTERS (8, 2)
  958. #else
  959. BO = B;
  960. temp = k;
  961. #endif
  962. BLASLONG l = 0;
  963. FLOAT t[4] = { 0 };
  964. t[0] = BO[0], t[1] = BO[1];
  965. vec_t *rowB = (vec_t *) & t[0];
  966. vec_t *rowA = (vec_t *) & AO[0];
  967. __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
  968. __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]);
  969. for (l = 1; l < temp; l++)
  970. {
  971. t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
  972. rowB = (vec_t *) & t[0];
  973. rowA = (vec_t *) & AO[l << 3];
  974. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  975. __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
  976. }
  977. SAVE2x4_ACC (&acc0, 0);
  978. SAVE2x4_ACC (&acc1, 4);
  979. CO += 8;
  980. AO += temp << 3;
  981. BO += temp << 1;
  982. #if defined(TRMMKERNEL)
  983. REFRESH_AFTER_SAVE (8, 2)
  984. #endif
  985. }
  986. if (m & 4)
  987. {
  988. FLOAT *BO;
  989. v4sf_t *rowC;
  990. v4sf_t result[4];
  991. __vector_quad acc0;
  992. #if defined(TRMMKERNEL)
  993. REFRESH_POINTERS (4, 2)
  994. #else
  995. BO = B;
  996. temp = k;
  997. #endif
  998. BLASLONG l = 0;
  999. FLOAT t[4] = { 0 };
  1000. t[0] = BO[0], t[1] = BO[1];
  1001. vec_t *rowB = (vec_t *) & t[0];
  1002. vec_t *rowA = (vec_t *) & AO[0];
  1003. __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
  1004. for (l = 1; l < temp; l++)
  1005. {
  1006. t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
  1007. rowB = (vec_t *) & t[0];
  1008. rowA = (vec_t *) & AO[l << 2];
  1009. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  1010. }
  1011. SAVE2x4_ACC (&acc0, 0);
  1012. CO += 4;
  1013. AO += temp << 2;
  1014. BO += temp << 1;
  1015. #if defined(TRMMKERNEL)
  1016. REFRESH_AFTER_SAVE (4, 2)
  1017. #endif
  1018. }
  1019. if (m & 2)
  1020. {
  1021. FLOAT *BO;
  1022. BLASLONG l = 0;
  1023. #if defined(TRMMKERNEL)
  1024. REFRESH_POINTERS (2, 2)
  1025. #else
  1026. BO = B;
  1027. temp = k;
  1028. #endif
  1029. v4sf_t t = { 0, 0, 0, 0 };
  1030. for (l = 0; l < (temp << 1); l += 2)
  1031. {
  1032. v4sf_t rowA = { AO[l], AO[l], AO[l + 1], AO[l + 1] };
  1033. v4sf_t rowB = { BO[l], BO[l + 1], BO[l], BO[l + 1] };
  1034. t += rowA * rowB;
  1035. }
  1036. t = t * valpha;
  1037. #if defined(TRMMKERNEL)
  1038. CO[0 * ldc] = t[0];
  1039. CO[1 * ldc] = t[1];
  1040. CO[0 * ldc + 1] = t[2];
  1041. CO[1 * ldc + 1] = t[3];
  1042. #else
  1043. CO[0 * ldc] += t[0];
  1044. CO[1 * ldc] += t[1];
  1045. CO[0 * ldc + 1] += t[2];
  1046. CO[1 * ldc + 1] += t[3];
  1047. #endif
  1048. CO += 2;
  1049. AO += temp << 1;
  1050. BO += temp << 1;
  1051. #if defined(TRMMKERNEL)
  1052. REFRESH_AFTER_SAVE (2, 2)
  1053. #endif
  1054. }
  1055. if (m & 1)
  1056. {
  1057. FLOAT *BO;
  1058. BLASLONG l = 0;
  1059. #if defined(TRMMKERNEL)
  1060. REFRESH_POINTERS (1, 2)
  1061. #else
  1062. BO = B;
  1063. temp = k;
  1064. #endif
  1065. v4sf_t t = { 0, 0, 0, 0 };
  1066. for (l = 0; l < temp; l++)
  1067. {
  1068. v4sf_t rowA = { AO[l], AO[l], 0, 0 };
  1069. v4sf_t rowB = { BO[l << 1], BO[(l << 1) + 1], 0, 0 };
  1070. t += rowA * rowB;
  1071. }
  1072. t = t * valpha;
  1073. #if defined(TRMMKERNEL)
  1074. CO[0 * ldc] = t[0];
  1075. CO[1 * ldc] = t[1];
  1076. #else
  1077. CO[0 * ldc] += t[0];
  1078. CO[1 * ldc] += t[1];
  1079. #endif
  1080. CO += 1;
  1081. AO += temp;
  1082. BO += temp << 1;
  1083. #if defined(TRMMKERNEL)
  1084. REFRESH_AFTER_SAVE (1, 2)
  1085. #endif
  1086. }
  1087. #if defined(TRMMKERNEL) && !defined(LEFT)
  1088. off += 2; // number of values in A
  1089. #endif
  1090. B += k << 1;
  1091. }
  1092. if (n & 1)
  1093. {
  1094. BLASLONG i, temp;
  1095. #if defined(TRMMKERNEL) && defined(LEFT)
  1096. off = offset;
  1097. #endif
  1098. FLOAT *CO;
  1099. FLOAT *AO;
  1100. CO = C;
  1101. C += ldc;
  1102. AO = A;
  1103. for (i = 0; i < (m >> 4); i++)
  1104. {
  1105. FLOAT *BO;
  1106. BLASLONG l = 0;
  1107. #if defined(TRMMKERNEL)
  1108. REFRESH_POINTERS (16, 1)
  1109. #else
  1110. BO = B;
  1111. temp = k;
  1112. #endif
  1113. v4sf_t t = { 0, 0, 0, 0 };
  1114. v4sf_t t1 = { 0, 0, 0, 0 };
  1115. v4sf_t t2 = { 0, 0, 0, 0 };
  1116. v4sf_t t3 = { 0, 0, 0, 0 };
  1117. for (l = 0; l < temp; l++)
  1118. {
  1119. v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] };
  1120. v4sf_t rowA = { AO[l << 4], AO[(l << 4) + 1], AO[(l << 4) + 2],
  1121. AO[(l << 4) + 3]
  1122. };
  1123. v4sf_t rowA1 =
  1124. { AO[(l << 4) + 4], AO[(l << 4) + 5], AO[(l << 4) + 6],
  1125. AO[(l << 4) + 7]
  1126. };
  1127. v4sf_t rowA2 =
  1128. { AO[(l << 4) + 8], AO[(l << 4) + 9], AO[(l << 4) + 10],
  1129. AO[(l << 4) + 11]
  1130. };
  1131. v4sf_t rowA3 =
  1132. { AO[(l << 4) + 12], AO[(l << 4) + 13], AO[(l << 4) + 14],
  1133. AO[(l << 4) + 15]
  1134. };
  1135. t += rowA * rowB;
  1136. t1 += rowA1 * rowB;
  1137. t2 += rowA2 * rowB;
  1138. t3 += rowA3 * rowB;
  1139. }
  1140. t = t * valpha;
  1141. t1 = t1 * valpha;
  1142. t2 = t2 * valpha;
  1143. t3 = t3 * valpha;
  1144. #if defined(TRMMKERNEL)
  1145. CO[0] = t[0];
  1146. CO[1] = t[1];
  1147. CO[2] = t[2];
  1148. CO[3] = t[3];
  1149. CO[4] = t1[0];
  1150. CO[5] = t1[1];
  1151. CO[6] = t1[2];
  1152. CO[7] = t1[3];
  1153. CO[8] = t2[0];
  1154. CO[9] = t2[1];
  1155. CO[10] = t2[2];
  1156. CO[11] = t2[3];
  1157. CO[12] = t3[0];
  1158. CO[13] = t3[1];
  1159. CO[14] = t3[2];
  1160. CO[15] = t3[3];
  1161. #else
  1162. CO[0] += t[0];
  1163. CO[1] += t[1];
  1164. CO[2] += t[2];
  1165. CO[3] += t[3];
  1166. CO[4] += t1[0];
  1167. CO[5] += t1[1];
  1168. CO[6] += t1[2];
  1169. CO[7] += t1[3];
  1170. CO[8] += t2[0];
  1171. CO[9] += t2[1];
  1172. CO[10] += t2[2];
  1173. CO[11] += t2[3];
  1174. CO[12] += t3[0];
  1175. CO[13] += t3[1];
  1176. CO[14] += t3[2];
  1177. CO[15] += t3[3];
  1178. #endif
  1179. AO += temp << 4;
  1180. BO += temp;
  1181. CO += 16;
  1182. #if defined(TRMMKERNEL)
  1183. REFRESH_AFTER_SAVE (16, 1)
  1184. #endif
  1185. }
  1186. if (m & 8)
  1187. {
  1188. FLOAT *BO;
  1189. BLASLONG l = 0;
  1190. v4sf_t t = { 0, 0, 0, 0 };
  1191. v4sf_t t1 = { 0, 0, 0, 0 };
  1192. #if defined(TRMMKERNEL)
  1193. REFRESH_POINTERS (8, 1)
  1194. #else
  1195. BO = B;
  1196. temp = k;
  1197. #endif
  1198. for (l = 0; l < temp; l++)
  1199. {
  1200. v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] };
  1201. v4sf_t rowA = { AO[l << 3], AO[(l << 3) + 1], AO[(l << 3) + 2],
  1202. AO[(l << 3) + 3]
  1203. };
  1204. v4sf_t rowA1 =
  1205. { AO[(l << 3) + 4], AO[(l << 3) + 5], AO[(l << 3) + 6],
  1206. AO[(l << 3) + 7]
  1207. };
  1208. t += rowA * rowB;
  1209. t1 += rowA1 * rowB;
  1210. }
  1211. t = t * valpha;
  1212. t1 = t1 * valpha;
  1213. #if defined(TRMMKERNEL)
  1214. CO[0] = t[0];
  1215. CO[1] = t[1];
  1216. CO[2] = t[2];
  1217. CO[3] = t[3];
  1218. CO[4] = t1[0];
  1219. CO[5] = t1[1];
  1220. CO[6] = t1[2];
  1221. CO[7] = t1[3];
  1222. #else
  1223. CO[0] += t[0];
  1224. CO[1] += t[1];
  1225. CO[2] += t[2];
  1226. CO[3] += t[3];
  1227. CO[4] += t1[0];
  1228. CO[5] += t1[1];
  1229. CO[6] += t1[2];
  1230. CO[7] += t1[3];
  1231. #endif
  1232. AO += temp << 3;
  1233. BO += temp;
  1234. CO += 8;
  1235. #if defined(TRMMKERNEL)
  1236. REFRESH_AFTER_SAVE (8, 1)
  1237. #endif
  1238. }
  1239. if (m & 4)
  1240. {
  1241. FLOAT *BO;
  1242. BLASLONG l = 0;
  1243. v4sf_t t = { 0, 0, 0, 0 };
  1244. #if defined(TRMMKERNEL)
  1245. REFRESH_POINTERS (4, 1)
  1246. #else
  1247. BO = B;
  1248. temp = k;
  1249. #endif
  1250. for (l = 0; l < temp; l++)
  1251. {
  1252. v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] };
  1253. v4sf_t rowA = { AO[l << 2], AO[(l << 2) + 1], AO[(l << 2) + 2],
  1254. AO[(l << 2) + 3]
  1255. };
  1256. t += rowA * rowB;
  1257. }
  1258. t = t * valpha;
  1259. #if defined(TRMMKERNEL)
  1260. CO[0] = t[0];
  1261. CO[1] = t[1];
  1262. CO[2] = t[2];
  1263. CO[3] = t[3];
  1264. #else
  1265. CO[0] += t[0];
  1266. CO[1] += t[1];
  1267. CO[2] += t[2];
  1268. CO[3] += t[3];
  1269. #endif
  1270. AO += temp << 2;
  1271. BO += temp;
  1272. CO += 4;
  1273. #if defined(TRMMKERNEL)
  1274. REFRESH_AFTER_SAVE (4, 1)
  1275. #endif
  1276. }
  1277. if (m & 2)
  1278. {
  1279. FLOAT *BO;
  1280. BLASLONG l = 0;
  1281. #if defined(TRMMKERNEL)
  1282. REFRESH_POINTERS (2, 1)
  1283. #else
  1284. BO = B;
  1285. temp = k;
  1286. #endif
  1287. v4sf_t t = { 0, 0, 0, 0 };
  1288. for (l = 0; l < temp; l++)
  1289. {
  1290. v4sf_t rowB = { BO[l], BO[l], 0, 0 };
  1291. v4sf_t rowA = { AO[l << 1], AO[(l << 1) + 1], 0, 0 };
  1292. t += rowA * rowB;
  1293. }
  1294. t = t * valpha;
  1295. #if defined(TRMMKERNEL)
  1296. CO[0] = t[0];
  1297. CO[1] = t[1];
  1298. #else
  1299. CO[0] += t[0];
  1300. CO[1] += t[1];
  1301. #endif
  1302. AO += temp << 1;
  1303. BO += temp;
  1304. CO += 2;
  1305. #if defined(TRMMKERNEL)
  1306. REFRESH_AFTER_SAVE (2, 1)
  1307. #endif
  1308. }
  1309. if (m & 1)
  1310. {
  1311. FLOAT *BO;
  1312. #if defined(TRMMKERNEL)
  1313. REFRESH_POINTERS (1, 1)
  1314. #else
  1315. BO = B;
  1316. temp = k;
  1317. #endif
  1318. BLASLONG l = 0;
  1319. FLOAT t = 0;
  1320. for (l = 0; l < temp; l++)
  1321. {
  1322. t += AO[l] * BO[l];
  1323. }
  1324. AO += temp;
  1325. BO += temp;
  1326. #if defined(TRMMKERNEL)
  1327. CO[0] = t * alpha;
  1328. #else
  1329. CO[0] += t * alpha;
  1330. #endif
  1331. CO += 1;
  1332. #if defined(TRMMKERNEL)
  1333. REFRESH_AFTER_SAVE (1, 1)
  1334. #endif
  1335. }
  1336. #if defined(TRMMKERNEL) && !defined(LEFT)
  1337. off += 1; // number of values in A
  1338. #endif
  1339. B += k;
  1340. }
  1341. return 0;
  1342. }