You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

sgemm_kernel_power10.c 35 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362
  1. /*********************************************************************************
  2. Copyright (c) 2020, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. **********************************************************************************/
  27. #include "common.h"
  28. #include <altivec.h>
  29. typedef __vector unsigned char vec_t; /* AltiVec vector of unsigned char; the kernel casts the AO/BO panel pointers to this type before passing them to the __builtin_mma_xvf32ger* calls */
  30. typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); /* 16-byte vector of FLOAT; four lanes if FLOAT is a 4-byte float (assumption: FLOAT comes from common.h, confirm) */
  31. typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); /* 8-byte vector of FLOAT; two lanes under the same assumption; used by the SAVE4x2_* macros */
  /*
   * Accumulator write-back macros.
   *
   * Each macro unpacks the MMA accumulator pointed to by ACC into the caller's
   * local `result` array (__builtin_mma_disassemble_acc), multiplies the
   * selected unpacked vectors by the scalar `alpha`, and writes each one
   * through `rowC` to &CO[n * ldc + J] for a fixed run of n:
   *
   *   SAVE_ACC      n = 0..3, v4sf_t stores, result[0..3]
   *   SAVE_ACC1     n = 4..7, v4sf_t stores, result[0..3]
   *   SAVE4x2_ACC   n = 0..3, v2sf_t stores, result[0], [2], [4], [6]
   *   SAVE4x2_ACC1  n = 4..7, v2sf_t stores, result[0], [2], [4], [6]
   *   SAVE2x4_ACC   n = 0..1, v4sf_t stores, result[0..1]
   *
   * With TRMMKERNEL defined the destination is overwritten (=); otherwise the
   * scaled value is added to what is already there (+=).
   *
   * The expansion refers to the caller's `result`, `rowC`, `CO`, `ldc` and
   * `alpha`, which must be in scope with matching vector types.  The macros
   * are plain statement sequences ending in ';' (not do/while wrapped), so
   * they must not be used as the unbraced body of an if/for.  ACC and J are
   * parenthesized so that expression arguments bind as a unit.
   */
  #if defined(TRMMKERNEL)
  #define SAVE_ACC(ACC, J) \
    __builtin_mma_disassemble_acc ((void *) result, (ACC)); \
    rowC = (v4sf_t *) &CO[0 * ldc + (J)]; \
    rowC[0] = result[0] * alpha; \
    rowC = (v4sf_t *) &CO[1 * ldc + (J)]; \
    rowC[0] = result[1] * alpha; \
    rowC = (v4sf_t *) &CO[2 * ldc + (J)]; \
    rowC[0] = result[2] * alpha; \
    rowC = (v4sf_t *) &CO[3 * ldc + (J)]; \
    rowC[0] = result[3] * alpha;
  #define SAVE_ACC1(ACC, J) \
    __builtin_mma_disassemble_acc ((void *) result, (ACC)); \
    rowC = (v4sf_t *) &CO[4 * ldc + (J)]; \
    rowC[0] = result[0] * alpha; \
    rowC = (v4sf_t *) &CO[5 * ldc + (J)]; \
    rowC[0] = result[1] * alpha; \
    rowC = (v4sf_t *) &CO[6 * ldc + (J)]; \
    rowC[0] = result[2] * alpha; \
    rowC = (v4sf_t *) &CO[7 * ldc + (J)]; \
    rowC[0] = result[3] * alpha;
  #define SAVE4x2_ACC(ACC, J) \
    __builtin_mma_disassemble_acc ((void *) result, (ACC)); \
    rowC = (v2sf_t *) &CO[0 * ldc + (J)]; \
    rowC[0] = result[0] * alpha; \
    rowC = (v2sf_t *) &CO[1 * ldc + (J)]; \
    rowC[0] = result[2] * alpha; \
    rowC = (v2sf_t *) &CO[2 * ldc + (J)]; \
    rowC[0] = result[4] * alpha; \
    rowC = (v2sf_t *) &CO[3 * ldc + (J)]; \
    rowC[0] = result[6] * alpha;
  #define SAVE4x2_ACC1(ACC, J) \
    __builtin_mma_disassemble_acc ((void *) result, (ACC)); \
    rowC = (v2sf_t *) &CO[4 * ldc + (J)]; \
    rowC[0] = result[0] * alpha; \
    rowC = (v2sf_t *) &CO[5 * ldc + (J)]; \
    rowC[0] = result[2] * alpha; \
    rowC = (v2sf_t *) &CO[6 * ldc + (J)]; \
    rowC[0] = result[4] * alpha; \
    rowC = (v2sf_t *) &CO[7 * ldc + (J)]; \
    rowC[0] = result[6] * alpha;
  #define SAVE2x4_ACC(ACC, J) \
    __builtin_mma_disassemble_acc ((void *) result, (ACC)); \
    rowC = (v4sf_t *) &CO[0 * ldc + (J)]; \
    rowC[0] = result[0] * alpha; \
    rowC = (v4sf_t *) &CO[1 * ldc + (J)]; \
    rowC[0] = result[1] * alpha;
  #else
  #define SAVE_ACC(ACC, J) \
    __builtin_mma_disassemble_acc ((void *) result, (ACC)); \
    rowC = (v4sf_t *) &CO[0 * ldc + (J)]; \
    rowC[0] += result[0] * alpha; \
    rowC = (v4sf_t *) &CO[1 * ldc + (J)]; \
    rowC[0] += result[1] * alpha; \
    rowC = (v4sf_t *) &CO[2 * ldc + (J)]; \
    rowC[0] += result[2] * alpha; \
    rowC = (v4sf_t *) &CO[3 * ldc + (J)]; \
    rowC[0] += result[3] * alpha;
  #define SAVE_ACC1(ACC, J) \
    __builtin_mma_disassemble_acc ((void *) result, (ACC)); \
    rowC = (v4sf_t *) &CO[4 * ldc + (J)]; \
    rowC[0] += result[0] * alpha; \
    rowC = (v4sf_t *) &CO[5 * ldc + (J)]; \
    rowC[0] += result[1] * alpha; \
    rowC = (v4sf_t *) &CO[6 * ldc + (J)]; \
    rowC[0] += result[2] * alpha; \
    rowC = (v4sf_t *) &CO[7 * ldc + (J)]; \
    rowC[0] += result[3] * alpha;
  #define SAVE4x2_ACC(ACC, J) \
    __builtin_mma_disassemble_acc ((void *) result, (ACC)); \
    rowC = (v2sf_t *) &CO[0 * ldc + (J)]; \
    rowC[0] += result[0] * alpha; \
    rowC = (v2sf_t *) &CO[1 * ldc + (J)]; \
    rowC[0] += result[2] * alpha; \
    rowC = (v2sf_t *) &CO[2 * ldc + (J)]; \
    rowC[0] += result[4] * alpha; \
    rowC = (v2sf_t *) &CO[3 * ldc + (J)]; \
    rowC[0] += result[6] * alpha;
  #define SAVE4x2_ACC1(ACC, J) \
    __builtin_mma_disassemble_acc ((void *) result, (ACC)); \
    rowC = (v2sf_t *) &CO[4 * ldc + (J)]; \
    rowC[0] += result[0] * alpha; \
    rowC = (v2sf_t *) &CO[5 * ldc + (J)]; \
    rowC[0] += result[2] * alpha; \
    rowC = (v2sf_t *) &CO[6 * ldc + (J)]; \
    rowC[0] += result[4] * alpha; \
    rowC = (v2sf_t *) &CO[7 * ldc + (J)]; \
    rowC[0] += result[6] * alpha;
  #define SAVE2x4_ACC(ACC, J) \
    __builtin_mma_disassemble_acc ((void *) result, (ACC)); \
    rowC = (v4sf_t *) &CO[0 * ldc + (J)]; \
    rowC[0] += result[0] * alpha; \
    rowC = (v4sf_t *) &CO[1 * ldc + (J)]; \
    rowC[0] += result[1] * alpha;
  #endif
  /*
   * KERNEL(i, j): one accumulation step over the eight accumulators
   * acc0..acc7, each updated with __builtin_mma_xvf32gerpp.
   *
   * Even-numbered accumulators take rowB[i], odd-numbered ones take
   * rowB[i + 1]; the pair (acc(2n), acc(2n + 1)) takes rowA[j + n] for
   * n = 0..3.  So a single step reads two vectors from rowB and four from
   * rowA.
   *
   * The expansion refers to the caller's `rowA`, `rowB` and `acc0`..`acc7`.
   * i and j are parenthesized so that expression arguments index correctly.
   * The macro is a plain statement sequence ending in ';'.
   */
  #define KERNEL(i, j) \
    __builtin_mma_xvf32gerpp (&acc0, rowB[(i)], rowA[(j)]); \
    __builtin_mma_xvf32gerpp (&acc1, rowB[(i) + 1], rowA[(j)]); \
    __builtin_mma_xvf32gerpp (&acc2, rowB[(i)], rowA[(j) + 1]); \
    __builtin_mma_xvf32gerpp (&acc3, rowB[(i) + 1], rowA[(j) + 1]); \
    __builtin_mma_xvf32gerpp (&acc4, rowB[(i)], rowA[(j) + 2]); \
    __builtin_mma_xvf32gerpp (&acc5, rowB[(i) + 1], rowA[(j) + 2]); \
    __builtin_mma_xvf32gerpp (&acc6, rowB[(i)], rowA[(j) + 3]); \
    __builtin_mma_xvf32gerpp (&acc7, rowB[(i) + 1], rowA[(j) + 3]);
  /*
   * PREFETCH1(x, y): issue a `dcbt` data-cache touch (prefetch hint) for the
   * address x + y.
   *
   * `dcbt RA,RB` forms its address as (RA|0) + (RB): when RA is encoded as
   * r0 the instruction uses literal zero instead of the register contents.
   * x is the RA operand, so it carries the "b" (base register, never r0)
   * constraint; y is the RB operand and may live in any general register.
   *
   * Spelled __asm__ __volatile__ so the macro also builds in strict ISO
   * modes (-std=c99 / -std=c11), where the plain `asm` keyword is disabled.
   * The trailing ';' is part of the macro, as in the original definition.
   */
  #define PREFETCH1(x, y) \
    __asm__ __volatile__ ("dcbt %0, %1" : : "b" (x), "r" (y) : "memory");
  /*
   * TRMM pointer/trip-count setup, selected by LEFT and TRANSA.
   *
   * REFRESH_TEMP_BK(x, y) sets the caller's `temp` (used by the kernel body
   * as its inner-loop trip count):
   *   - LEFT without TRANSA, or TRANSA without LEFT:  temp = k - off
   *   - LEFT with TRANSA:                             temp = off + x
   *   - neither LEFT nor TRANSA:                      temp = off + y
   *
   * REFRESH_POINTERS(x, y) positions the packed-panel pointers and then
   * applies REFRESH_TEMP_BK:
   *   - LEFT with TRANSA, or neither defined:  BO = B
   *   - otherwise:                             AO += off * x; BO = B + off * y
   *
   * The expansions refer to the caller's `temp`, `k`, `off`, `AO`, `BO` and
   * `B`.  They are statement sequences that already end in ';' (call sites
   * invoke them both with and without a trailing semicolon), so they must not
   * be used as the unbraced body of an if/for.  x and y are parenthesized so
   * that expression arguments bind as a unit.
   */
  #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  #define REFRESH_TEMP_BK(x, y) \
    temp = k - off;
  #elif defined(LEFT)
  #define REFRESH_TEMP_BK(x, y) \
    temp = off + (x);
  #else
  #define REFRESH_TEMP_BK(x, y) \
    temp = off + (y);
  #endif
  #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  #define REFRESH_POINTERS(x, y) \
    BO = B; \
    REFRESH_TEMP_BK (x, y)
  #else
  #define REFRESH_POINTERS(x, y) \
    AO += off * (x); \
    BO = B + off * (y); \
    REFRESH_TEMP_BK (x, y)
  #endif
  /*
   * TRMM bookkeeping applied after a tile has been written back.
   *
   * REFRESH_OFF(x)      with LEFT: off += x; otherwise expands to nothing.
   * UPDATE_TEMP(x, y)   with LEFT: temp -= x; otherwise temp -= y.
   * REFRESH_TMP_AFTER_SAVE(x, y)
   *                     with (LEFT and TRANSA) or (neither): recomputes
   *                     temp = k - off, applies UPDATE_TEMP, then advances
   *                     AO by temp * x and BO by temp * y; in the other
   *                     configurations it expands to nothing.
   * REFRESH_AFTER_SAVE(x, y)
   *                     REFRESH_TMP_AFTER_SAVE followed by REFRESH_OFF.
   *
   * The expansions refer to the caller's `temp`, `k`, `off`, `AO` and `BO`.
   * They are statement sequences that already end in ';' (call sites invoke
   * REFRESH_AFTER_SAVE without a trailing semicolon), so they must not be
   * used as the unbraced body of an if/for.  x and y are parenthesized so
   * that expression arguments bind as a unit.
   */
  #ifdef LEFT
  #define REFRESH_OFF(x) \
    off += (x);
  #define UPDATE_TEMP(x, y) \
    temp -= (x);
  #else
  #define REFRESH_OFF(x)
  #define UPDATE_TEMP(x, y) \
    temp -= (y);
  #endif
  #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  #define REFRESH_TMP_AFTER_SAVE(x, y) \
    temp = k - off; \
    UPDATE_TEMP (x, y) \
    AO += temp * (x); \
    BO += temp * (y);
  #else
  #define REFRESH_TMP_AFTER_SAVE(x, y)
  #endif
  #define REFRESH_AFTER_SAVE(x, y) \
    REFRESH_TMP_AFTER_SAVE (x, y) \
    REFRESH_OFF (x)
  182. /*************************************************************************************
  183. * GEMM Kernel
  184. *************************************************************************************/
  185. int
  186. CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
  187. FLOAT * C, BLASLONG ldc
  188. #ifdef TRMMKERNEL
  189. , BLASLONG offset
  190. #endif
  191. )
  192. {
  193. BLASLONG i1;
  194. #if defined(TRMMKERNEL)
  195. BLASLONG off;
  196. #endif
  197. #if defined(TRMMKERNEL) && !defined(LEFT)
  198. off = -offset;
  199. #endif
  200. v4sf_t valpha = { alpha, alpha, alpha, alpha };
  201. for (i1 = 0; i1 < (n >> 3); i1++)
  202. {
  203. BLASLONG j, temp;
  204. FLOAT *CO;
  205. FLOAT *AO;
  206. #if defined(TRMMKERNEL) && defined(LEFT)
  207. off = offset;
  208. #endif
  209. CO = C;
  210. C += ldc << 3;
  211. AO = A;
  212. PREFETCH1 (A, 128);
  213. PREFETCH1 (A, 256);
  214. for (j = 0; j < (m >> 4); j++)
  215. {
  216. FLOAT *BO;
  217. #if defined(TRMMKERNEL)
  218. REFRESH_POINTERS (16, 8);
  219. #else
  220. BO = B;
  221. temp = k;
  222. #endif
  223. v4sf_t *rowC;
  224. v4sf_t result[4];
  225. __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
  226. BLASLONG l = 0;
  227. vec_t *rowA1 = (vec_t *) & AO[0];
  228. vec_t *rowB1 = (vec_t *) & BO[0];
  229. __builtin_mma_xvf32ger (&acc0, rowB1[0], rowA1[0]);
  230. __builtin_mma_xvf32ger (&acc1, rowB1[1], rowA1[0]);
  231. __builtin_mma_xvf32ger (&acc2, rowB1[0], rowA1[1]);
  232. __builtin_mma_xvf32ger (&acc3, rowB1[1], rowA1[1]);
  233. __builtin_mma_xvf32ger (&acc4, rowB1[0], rowA1[2]);
  234. __builtin_mma_xvf32ger (&acc5, rowB1[1], rowA1[2]);
  235. __builtin_mma_xvf32ger (&acc6, rowB1[0], rowA1[3]);
  236. __builtin_mma_xvf32ger (&acc7, rowB1[1], rowA1[3]);
  237. AO += 16;
  238. BO += 8;
  239. temp--;
  240. BLASLONG K = temp / 64;
  241. for (l = 0; l < K; l++)
  242. {
  243. vec_t *rowA = (vec_t *) & AO[0];
  244. vec_t *rowB = (vec_t *) & BO[0];
  245. KERNEL (0, 0);
  246. KERNEL (2, 4);
  247. KERNEL (4, 8);
  248. KERNEL (6, 12);
  249. KERNEL (8, 16);
  250. KERNEL (10, 20);
  251. KERNEL (12, 24);
  252. KERNEL (14, 28);
  253. KERNEL (16, 32);
  254. KERNEL (18, 36);
  255. KERNEL (20, 40);
  256. KERNEL (22, 44);
  257. KERNEL (24, 48);
  258. KERNEL (26, 52);
  259. KERNEL (28, 56);
  260. KERNEL (30, 60);
  261. KERNEL (32, 64);
  262. KERNEL (34, 68);
  263. KERNEL (36, 72);
  264. KERNEL (38, 76);
  265. KERNEL (40, 80);
  266. KERNEL (42, 84);
  267. KERNEL (44, 88);
  268. KERNEL (46, 92);
  269. KERNEL (48, 96);
  270. KERNEL (50, 100);
  271. KERNEL (52, 104);
  272. KERNEL (54, 108);
  273. KERNEL (56, 112);
  274. KERNEL (58, 116);
  275. KERNEL (60, 120);
  276. KERNEL (62, 124);
  277. KERNEL (64, 128);
  278. KERNEL (66, 132);
  279. KERNEL (68, 136);
  280. KERNEL (70, 140);
  281. KERNEL (72, 144);
  282. KERNEL (74, 148);
  283. KERNEL (76, 152);
  284. KERNEL (78, 156);
  285. KERNEL (80, 160);
  286. KERNEL (82, 164);
  287. KERNEL (84, 168);
  288. KERNEL (86, 172);
  289. KERNEL (88, 176);
  290. KERNEL (90, 180);
  291. KERNEL (92, 184);
  292. KERNEL (94, 188);
  293. KERNEL (96, 192);
  294. KERNEL (98, 196);
  295. KERNEL (100, 200);
  296. KERNEL (102, 204);
  297. KERNEL (104, 208);
  298. KERNEL (106, 212);
  299. KERNEL (108, 216);
  300. KERNEL (110, 220);
  301. KERNEL (112, 224);
  302. KERNEL (114, 228);
  303. KERNEL (116, 232);
  304. KERNEL (118, 236);
  305. KERNEL (120, 240);
  306. KERNEL (122, 244);
  307. KERNEL (124, 248);
  308. KERNEL (126, 252);
  309. AO += 1024;
  310. BO += 512;
  311. }
  312. if ((temp & 63) >> 5)
  313. {
  314. vec_t *rowA = (vec_t *) & AO[0];
  315. vec_t *rowB = (vec_t *) & BO[0];
  316. KERNEL (0, 0);
  317. KERNEL (2, 4);
  318. KERNEL (4, 8);
  319. KERNEL (6, 12);
  320. KERNEL (8, 16);
  321. KERNEL (10, 20);
  322. KERNEL (12, 24);
  323. KERNEL (14, 28);
  324. KERNEL (16, 32);
  325. KERNEL (18, 36);
  326. KERNEL (20, 40);
  327. KERNEL (22, 44);
  328. KERNEL (24, 48);
  329. KERNEL (26, 52);
  330. KERNEL (28, 56);
  331. KERNEL (30, 60);
  332. KERNEL (32, 64);
  333. KERNEL (34, 68);
  334. KERNEL (36, 72);
  335. KERNEL (38, 76);
  336. KERNEL (40, 80);
  337. KERNEL (42, 84);
  338. KERNEL (44, 88);
  339. KERNEL (46, 92);
  340. KERNEL (48, 96);
  341. KERNEL (50, 100);
  342. KERNEL (52, 104);
  343. KERNEL (54, 108);
  344. KERNEL (56, 112);
  345. KERNEL (58, 116);
  346. KERNEL (60, 120);
  347. KERNEL (62, 124);
  348. AO += 512;
  349. BO += 256;
  350. }
  351. if ((temp & 31) >> 4)
  352. {
  353. vec_t *rowA = (vec_t *) & AO[0];
  354. vec_t *rowB = (vec_t *) & BO[0];
  355. KERNEL (0, 0);
  356. KERNEL (2, 4);
  357. KERNEL (4, 8);
  358. KERNEL (6, 12);
  359. KERNEL (8, 16);
  360. KERNEL (10, 20);
  361. KERNEL (12, 24);
  362. KERNEL (14, 28);
  363. KERNEL (16, 32);
  364. KERNEL (18, 36);
  365. KERNEL (20, 40);
  366. KERNEL (22, 44);
  367. KERNEL (24, 48);
  368. KERNEL (26, 52);
  369. KERNEL (28, 56);
  370. KERNEL (30, 60);
  371. AO += 256;
  372. BO += 128;
  373. }
  374. if ((temp & 15) >> 3)
  375. {
  376. vec_t *rowA = (vec_t *) & AO[0];
  377. vec_t *rowB = (vec_t *) & BO[0];
  378. KERNEL (0, 0);
  379. KERNEL (2, 4);
  380. KERNEL (4, 8);
  381. KERNEL (6, 12);
  382. KERNEL (8, 16);
  383. KERNEL (10, 20);
  384. KERNEL (12, 24);
  385. KERNEL (14, 28);
  386. AO += 128;
  387. BO += 64;
  388. }
  389. if ((temp & 7) >> 2)
  390. {
  391. vec_t *rowA = (vec_t *) & AO[0];
  392. vec_t *rowB = (vec_t *) & BO[0];
  393. KERNEL (0, 0);
  394. KERNEL (2, 4);
  395. KERNEL (4, 8);
  396. KERNEL (6, 12);
  397. AO += 64;
  398. BO += 32;
  399. }
  400. if ((temp & 3) >> 1)
  401. {
  402. vec_t *rowA = (vec_t *) & AO[0];
  403. vec_t *rowB = (vec_t *) & BO[0];
  404. KERNEL (0, 0);
  405. KERNEL (2, 4);
  406. AO += 32;
  407. BO += 16;
  408. }
  409. if ((temp & 1) >> 0)
  410. {
  411. vec_t *rowA = (vec_t *) & AO[0];
  412. vec_t *rowB = (vec_t *) & BO[0];
  413. KERNEL (0, 0);
  414. AO += 16;
  415. BO += 8;
  416. }
  417. SAVE_ACC (&acc0, 0);
  418. SAVE_ACC (&acc2, 4);
  419. SAVE_ACC1 (&acc1, 0);
  420. SAVE_ACC1 (&acc3, 4);
  421. SAVE_ACC (&acc4, 8);
  422. SAVE_ACC (&acc6, 12);
  423. SAVE_ACC1 (&acc5, 8);
  424. SAVE_ACC1 (&acc7, 12);
  425. #if defined(TRMMKERNEL)
  426. REFRESH_AFTER_SAVE (16, 8)
  427. #endif
  428. CO += 16;
  429. }
  430. if (m & 8)
  431. {
  432. FLOAT *BO;
  433. #if defined(TRMMKERNEL)
  434. REFRESH_POINTERS (8, 8);
  435. #else
  436. BO = B;
  437. temp = k;
  438. #endif
  439. v4sf_t *rowC;
  440. v4sf_t result[4];
  441. __vector_quad acc0, acc1, acc2, acc3;
  442. BLASLONG l = 0;
  443. vec_t *rowA = (vec_t *) & AO[0];
  444. vec_t *rowB = (vec_t *) & BO[0];
  445. __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
  446. __builtin_mma_xvf32ger (&acc1, rowB[1], rowA[0]);
  447. __builtin_mma_xvf32ger (&acc2, rowB[0], rowA[1]);
  448. __builtin_mma_xvf32ger (&acc3, rowB[1], rowA[1]);
  449. for (l = 1; l < temp; l++)
  450. {
  451. rowA = (vec_t *) & AO[l << 3];
  452. rowB = (vec_t *) & BO[l << 3];
  453. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  454. __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]);
  455. __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[1]);
  456. __builtin_mma_xvf32gerpp (&acc3, rowB[1], rowA[1]);
  457. }
  458. SAVE_ACC (&acc0, 0);
  459. SAVE_ACC (&acc2, 4);
  460. SAVE_ACC1 (&acc1, 0);
  461. SAVE_ACC1 (&acc3, 4);
  462. AO += (temp << 3);
  463. BO += (temp << 3);
  464. CO += 8;
  465. #if defined(TRMMKERNEL)
  466. REFRESH_AFTER_SAVE (8, 8)
  467. #endif
  468. }
  469. if (m & 4)
  470. {
  471. FLOAT *BO;
  472. #if defined(TRMMKERNEL)
  473. REFRESH_POINTERS (4, 8);
  474. #else
  475. BO = B;
  476. temp = k;
  477. #endif
  478. v4sf_t *rowC;
  479. v4sf_t result[4];
  480. __vector_quad acc0, acc1;
  481. BLASLONG l = 0;
  482. vec_t *rowA = (vec_t *) & AO[0];
  483. vec_t *rowB = (vec_t *) & BO[0];
  484. __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
  485. __builtin_mma_xvf32ger (&acc1, rowB[1], rowA[0]);
  486. for (l = 1; l < temp; l++)
  487. {
  488. rowA = (vec_t *) & AO[l << 2];
  489. rowB = (vec_t *) & BO[l << 3];
  490. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  491. __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]);
  492. }
  493. SAVE_ACC (&acc0, 0);
  494. SAVE_ACC1 (&acc1, 0);
  495. CO += 4;
  496. AO += (temp << 2);
  497. BO += (temp << 3);
  498. #if defined(TRMMKERNEL)
  499. REFRESH_AFTER_SAVE (4, 8)
  500. #endif
  501. }
  502. if (m & 2)
  503. {
  504. FLOAT *BO;
  505. #if defined(TRMMKERNEL)
  506. REFRESH_POINTERS (2, 8);
  507. #else
  508. BO = B;
  509. temp = k;
  510. #endif
  511. v2sf_t *rowC;
  512. v2sf_t result[8];
  513. __vector_quad acc0, acc1;
  514. BLASLONG l = 0;
  515. FLOAT t[4] = { 0 };
  516. t[0] = AO[0], t[1] = AO[1];
  517. vec_t *rowA = (vec_t *) & t[0];
  518. vec_t *rowB = (vec_t *) & BO[0];
  519. __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
  520. __builtin_mma_xvf32ger (&acc1, rowB[1], rowA[0]);
  521. for (l = 1; l < temp; l++)
  522. {
  523. t[0] = AO[l << 1], t[1] = AO[(l << 1) + 1];
  524. rowA = (vec_t *) & t[0];
  525. rowB = (vec_t *) & BO[l << 3];
  526. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  527. __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]);
  528. }
  529. SAVE4x2_ACC (&acc0, 0);
  530. SAVE4x2_ACC1 (&acc1, 0);
  531. CO += 2;
  532. AO += (temp << 1);
  533. BO += (temp << 3);
  534. #if defined(TRMMKERNEL)
  535. REFRESH_AFTER_SAVE (2, 8)
  536. #endif
  537. }
  538. if (m & 1)
  539. {
  540. FLOAT *BO;
  541. #if defined(TRMMKERNEL)
  542. REFRESH_POINTERS (1, 8);
  543. #else
  544. BO = B;
  545. temp = k;
  546. #endif
  547. BLASLONG l = 0;
  548. v4sf_t t = { 0, 0, 0, 0 };
  549. v4sf_t t1 = { 0, 0, 0, 0 };
  550. for (l = 0; l < temp; l++)
  551. {
  552. v4sf_t rowA = { AO[l], AO[l], AO[l], AO[l] };
  553. v4sf_t rowB = { BO[l << 3], BO[(l << 3) + 1], BO[(l << 3) + 2],
  554. BO[(l << 3) + 3]
  555. };
  556. v4sf_t rowB1 =
  557. { BO[(l << 3) + 4], BO[(l << 3) + 5], BO[(l << 3) + 6],
  558. BO[(l << 3) + 7]
  559. };
  560. t += rowA * rowB;
  561. t1 += rowA * rowB1;
  562. }
  563. t = t * valpha;
  564. t1 = t1 * valpha;
  565. #if defined(TRMMKERNEL)
  566. CO[0 * ldc] = t[0];
  567. CO[1 * ldc] = t[1];
  568. CO[2 * ldc] = t[2];
  569. CO[3 * ldc] = t[3];
  570. CO[4 * ldc] = t1[0];
  571. CO[5 * ldc] = t1[1];
  572. CO[6 * ldc] = t1[2];
  573. CO[7 * ldc] = t1[3];
  574. #else
  575. CO[0 * ldc] += t[0];
  576. CO[1 * ldc] += t[1];
  577. CO[2 * ldc] += t[2];
  578. CO[3 * ldc] += t[3];
  579. CO[4 * ldc] += t1[0];
  580. CO[5 * ldc] += t1[1];
  581. CO[6 * ldc] += t1[2];
  582. CO[7 * ldc] += t1[3];
  583. #endif
  584. CO += 1;
  585. AO += temp;
  586. BO += (temp << 3);
  587. #if defined(TRMMKERNEL)
  588. REFRESH_AFTER_SAVE (1, 8)
  589. #endif
  590. }
  591. #if defined(TRMMKERNEL) && !defined(LEFT)
  592. off += 8; // number of values in A
  593. #endif
  594. B += k << 3;
  595. }
  596. if (n & 4)
  597. {
  598. BLASLONG i, j, temp;
  599. #if defined(TRMMKERNEL) && defined(LEFT)
  600. off = offset;
  601. #endif
  602. FLOAT *CO;
  603. FLOAT *AO;
  604. CO = C;
  605. C += ldc << 2;
  606. AO = A;
  607. #if !defined(TRMMKERNEL)
  608. i = m >> 5;
  609. for (j = 0; j < i; j++)
  610. {
  611. FLOAT *BO = B;
  612. v4sf_t *rowC;
  613. v4sf_t result[4];
  614. FLOAT *A1;
  615. A1 = AO + (16 * k);
  616. __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
  617. BLASLONG l = 0;
  618. vec_t *rowA = (vec_t *) & AO[0];
  619. vec_t *rowA1 = (vec_t *) & A1[0];
  620. vec_t *rowB = (vec_t *) & BO[0];
  621. __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
  622. __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]);
  623. __builtin_mma_xvf32ger (&acc2, rowB[0], rowA[2]);
  624. __builtin_mma_xvf32ger (&acc3, rowB[0], rowA[3]);
  625. __builtin_mma_xvf32ger (&acc4, rowB[0], rowA1[0]);
  626. __builtin_mma_xvf32ger (&acc5, rowB[0], rowA1[1]);
  627. __builtin_mma_xvf32ger (&acc6, rowB[0], rowA1[2]);
  628. __builtin_mma_xvf32ger (&acc7, rowB[0], rowA1[3]);
  629. for (l = 1; l < k; l++)
  630. {
  631. rowA = (vec_t *) & AO[l << 4];
  632. rowA1 = (vec_t *) & A1[l << 4];
  633. rowB = (vec_t *) & BO[l << 2];
  634. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  635. __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
  636. __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]);
  637. __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]);
  638. __builtin_mma_xvf32gerpp (&acc4, rowB[0], rowA1[0]);
  639. __builtin_mma_xvf32gerpp (&acc5, rowB[0], rowA1[1]);
  640. __builtin_mma_xvf32gerpp (&acc6, rowB[0], rowA1[2]);
  641. __builtin_mma_xvf32gerpp (&acc7, rowB[0], rowA1[3]);
  642. }
  643. SAVE_ACC (&acc0, 0);
  644. SAVE_ACC (&acc1, 4);
  645. CO += 8;
  646. SAVE_ACC (&acc2, 0);
  647. SAVE_ACC (&acc3, 4);
  648. CO += 8;
  649. SAVE_ACC (&acc4, 0);
  650. SAVE_ACC (&acc5, 4);
  651. CO += 8;
  652. SAVE_ACC (&acc6, 0);
  653. SAVE_ACC (&acc7, 4);
  654. CO += 8;
  655. AO += k << 5;
  656. BO += k << 2;
  657. }
  658. i = (m & 31) >> 4;
  659. #else
  660. i = m >> 4;
  661. #endif
  662. for (j = 0; j < i; j++)
  663. {
  664. FLOAT *BO;
  665. #if defined(TRMMKERNEL)
  666. REFRESH_POINTERS (16, 4);
  667. #else
  668. BO = B;
  669. temp = k;
  670. #endif
  671. v4sf_t *rowC;
  672. v4sf_t result[4];
  673. __vector_quad acc0, acc1, acc2, acc3;
  674. BLASLONG l = 0;
  675. vec_t *rowA = (vec_t *) & AO[0];
  676. vec_t *rowB = (vec_t *) & BO[0];
  677. __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
  678. __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]);
  679. __builtin_mma_xvf32ger (&acc2, rowB[0], rowA[2]);
  680. __builtin_mma_xvf32ger (&acc3, rowB[0], rowA[3]);
  681. for (l = 1; l < temp; l++)
  682. {
  683. rowA = (vec_t *) & AO[l << 4];
  684. rowB = (vec_t *) & BO[l << 2];
  685. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  686. __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
  687. __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]);
  688. __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]);
  689. }
  690. SAVE_ACC (&acc0, 0);
  691. SAVE_ACC (&acc1, 4);
  692. CO += 8;
  693. SAVE_ACC (&acc2, 0);
  694. SAVE_ACC (&acc3, 4);
  695. CO += 8;
  696. AO += temp << 4;
  697. BO += temp << 2;
  698. #if defined(TRMMKERNEL)
  699. REFRESH_AFTER_SAVE (16, 4)
  700. #endif
  701. }
  702. if (m & 8)
  703. {
  704. FLOAT *BO;
  705. #if defined(TRMMKERNEL)
  706. REFRESH_POINTERS (8, 4);
  707. #else
  708. BO = B;
  709. temp = k;
  710. #endif
  711. v4sf_t *rowC;
  712. v4sf_t result[4];
  713. __vector_quad acc0, acc1;
  714. BLASLONG l = 0;
  715. vec_t *rowA = (vec_t *) & AO[0];
  716. vec_t *rowB = (vec_t *) & BO[0];
  717. __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
  718. __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]);
  719. for (l = 1; l < temp; l++)
  720. {
  721. rowA = (vec_t *) & AO[l << 3];
  722. rowB = (vec_t *) & BO[l << 2];
  723. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  724. __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
  725. }
  726. SAVE_ACC (&acc0, 0);
  727. SAVE_ACC (&acc1, 4);
  728. CO += 8;
  729. AO += temp << 3;
  730. BO += temp << 2;
  731. #if defined(TRMMKERNEL)
  732. REFRESH_AFTER_SAVE (8, 4)
  733. #endif
  734. }
  735. if (m & 4)
  736. {
  737. FLOAT *BO;
  738. #if defined(TRMMKERNEL)
  739. REFRESH_POINTERS (4, 4);
  740. #else
  741. BO = B;
  742. temp = k;
  743. #endif
  744. v4sf_t *rowC;
  745. __vector_quad acc0;
  746. v4sf_t result[4];
  747. BLASLONG l = 0;
  748. vec_t *rowA = (vec_t *) & AO[0];
  749. vec_t *rowB = (vec_t *) & BO[0];
  750. __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
  751. for (l = 1; l < temp; l++)
  752. {
  753. rowA = (vec_t *) & AO[l << 2];
  754. rowB = (vec_t *) & BO[l << 2];
  755. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  756. }
  757. SAVE_ACC (&acc0, 0);
  758. CO += 4;
  759. AO += temp << 2;
  760. BO += temp << 2;
  761. #if defined(TRMMKERNEL)
  762. REFRESH_AFTER_SAVE (4, 4)
  763. #endif
  764. }
  765. if (m & 2)
  766. {
  767. FLOAT *BO;
  768. #if defined(TRMMKERNEL)
  769. REFRESH_POINTERS (2, 4);
  770. #else
  771. BO = B;
  772. temp = k;
  773. #endif
  774. v2sf_t *rowC;
  775. v2sf_t result[8];
  776. __vector_quad acc0;
  777. BLASLONG l = 0;
  778. FLOAT t[4] = { 0 };
  779. t[0] = AO[0], t[1] = AO[1];
  780. vec_t *rowA = (vec_t *) & t[0];
  781. vec_t *rowB = (vec_t *) & BO[0];
  782. __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
  783. for (l = 1; l < temp; l++)
  784. {
  785. t[0] = AO[l << 1], t[1] = AO[(l << 1) + 1];
  786. rowA = (vec_t *) & t[0];
  787. rowB = (vec_t *) & BO[l << 2];
  788. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  789. }
  790. SAVE4x2_ACC (&acc0, 0);
  791. CO += 2;
  792. AO += temp << 1;
  793. BO += temp << 2;
  794. #if defined(TRMMKERNEL)
  795. REFRESH_AFTER_SAVE (2, 4)
  796. #endif
  797. }
  798. if (m & 1)
  799. {
  800. FLOAT *BO;
  801. #if defined(TRMMKERNEL)
  802. REFRESH_POINTERS (1, 4)
  803. #else
  804. BO = B;
  805. temp = k;
  806. #endif
  807. BLASLONG l = 0;
  808. v4sf_t t = { 0, 0, 0, 0 };
  809. for (l = 0; l < temp; l++)
  810. {
  811. v4sf_t rowA = { AO[l], AO[l], AO[l], AO[l] };
  812. v4sf_t rowB = { BO[l << 2], BO[(l << 2) + 1], BO[(l << 2) + 2],
  813. BO[(l << 2) + 3]
  814. };
  815. t += rowA * rowB;
  816. }
  817. t = t * valpha;
  818. #if defined(TRMMKERNEL)
  819. CO[0 * ldc] = t[0];
  820. CO[1 * ldc] = t[1];
  821. CO[2 * ldc] = t[2];
  822. CO[3 * ldc] = t[3];
  823. #else
  824. CO[0 * ldc] += t[0];
  825. CO[1 * ldc] += t[1];
  826. CO[2 * ldc] += t[2];
  827. CO[3 * ldc] += t[3];
  828. #endif
  829. CO += 1;
  830. AO += temp;
  831. BO += temp << 2;
  832. #if defined(TRMMKERNEL)
  833. REFRESH_AFTER_SAVE (1, 4)
  834. #endif
  835. }
  836. #if defined(TRMMKERNEL) && !defined(LEFT)
  837. off += 4; // number of values in A
  838. #endif
  839. B += k << 2;
  840. }
  841. if (n & 2)
  842. {
  843. BLASLONG i, j, temp;
  844. #if defined(TRMMKERNEL) && defined(LEFT)
  845. off = offset;
  846. #endif
  847. FLOAT *CO;
  848. FLOAT *AO;
  849. CO = C;
  850. C += ldc << 1;
  851. AO = A;
  852. #if !defined(TRMMKERNEL)
  853. i = m >> 5;
  854. for (j = 0; j < i; j++)
  855. {
  856. FLOAT *BO = B;
  857. v4sf_t *rowC;
  858. v4sf_t result[4];
  859. FLOAT *A1;
  860. A1 = AO + (16 * k);
  861. __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
  862. BLASLONG l = 0;
  863. FLOAT t[4] = { 0 };
  864. t[0] = BO[0], t[1] = BO[1];
  865. vec_t *rowB = (vec_t *) & t[0];
  866. vec_t *rowA = (vec_t *) & AO[0];
  867. vec_t *rowA1 = (vec_t *) & A1[0];
  868. __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
  869. __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]);
  870. __builtin_mma_xvf32ger (&acc2, rowB[0], rowA[2]);
  871. __builtin_mma_xvf32ger (&acc3, rowB[0], rowA[3]);
  872. __builtin_mma_xvf32ger (&acc4, rowB[0], rowA1[0]);
  873. __builtin_mma_xvf32ger (&acc5, rowB[0], rowA1[1]);
  874. __builtin_mma_xvf32ger (&acc6, rowB[0], rowA1[2]);
  875. __builtin_mma_xvf32ger (&acc7, rowB[0], rowA1[3]);
  876. for (l = 1; l < k; l++)
  877. {
  878. t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
  879. rowB = (vec_t *) & t[0];
  880. rowA = (vec_t *) & AO[l << 4];
  881. rowA1 = (vec_t *) & A1[l << 4];
  882. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  883. __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
  884. __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]);
  885. __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]);
  886. __builtin_mma_xvf32gerpp (&acc4, rowB[0], rowA1[0]);
  887. __builtin_mma_xvf32gerpp (&acc5, rowB[0], rowA1[1]);
  888. __builtin_mma_xvf32gerpp (&acc6, rowB[0], rowA1[2]);
  889. __builtin_mma_xvf32gerpp (&acc7, rowB[0], rowA1[3]);
  890. }
  891. SAVE2x4_ACC (&acc0, 0);
  892. SAVE2x4_ACC (&acc1, 4);
  893. SAVE2x4_ACC (&acc2, 8);
  894. SAVE2x4_ACC (&acc3, 12);
  895. CO += 16;
  896. SAVE2x4_ACC (&acc4, 0);
  897. SAVE2x4_ACC (&acc5, 4);
  898. SAVE2x4_ACC (&acc6, 8);
  899. SAVE2x4_ACC (&acc7, 12);
  900. CO += 16;
  901. AO += k << 5;
  902. BO += k << 1;
  903. }
  904. i = (m & 31) >> 4;
  905. #else
  906. i = m >> 4;
  907. #endif
  908. for (j = 0; j < i; j++)
  909. {
  910. FLOAT *BO;
  911. v4sf_t *rowC;
  912. v4sf_t result[4];
  913. __vector_quad acc0, acc1, acc2, acc3;
  914. BLASLONG l = 0;
  915. #if defined(TRMMKERNEL)
  916. REFRESH_POINTERS (16, 2)
  917. #else
  918. BO = B;
  919. temp = k;
  920. #endif
  921. FLOAT t[4] = { 0 };
  922. t[0] = BO[0], t[1] = BO[1];
  923. vec_t *rowB = (vec_t *) & t[0];
  924. vec_t *rowA = (vec_t *) & AO[0];
  925. __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
  926. __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]);
  927. __builtin_mma_xvf32ger (&acc2, rowB[0], rowA[2]);
  928. __builtin_mma_xvf32ger (&acc3, rowB[0], rowA[3]);
  929. for (l = 1; l < temp; l++)
  930. {
  931. t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
  932. rowB = (vec_t *) & t[0];
  933. rowA = (vec_t *) & AO[l << 4];
  934. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  935. __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
  936. __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]);
  937. __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]);
  938. }
  939. SAVE2x4_ACC (&acc0, 0);
  940. SAVE2x4_ACC (&acc1, 4);
  941. SAVE2x4_ACC (&acc2, 8);
  942. SAVE2x4_ACC (&acc3, 12);
  943. CO += 16;
  944. AO += temp << 4;
  945. BO += temp << 1;
  946. #if defined(TRMMKERNEL)
  947. REFRESH_AFTER_SAVE (16, 2)
  948. #endif
  949. }
  950. if (m & 8)
  951. {
  952. FLOAT *BO;
  953. v4sf_t *rowC;
  954. v4sf_t result[4];
  955. __vector_quad acc0, acc1;
  956. #if defined(TRMMKERNEL)
  957. REFRESH_POINTERS (8, 2)
  958. #else
  959. BO = B;
  960. temp = k;
  961. #endif
  962. BLASLONG l = 0;
  963. FLOAT t[4] = { 0 };
  964. t[0] = BO[0], t[1] = BO[1];
  965. vec_t *rowB = (vec_t *) & t[0];
  966. vec_t *rowA = (vec_t *) & AO[0];
  967. __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
  968. __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]);
  969. for (l = 1; l < temp; l++)
  970. {
  971. t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
  972. rowB = (vec_t *) & t[0];
  973. rowA = (vec_t *) & AO[l << 3];
  974. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  975. __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
  976. }
  977. SAVE2x4_ACC (&acc0, 0);
  978. SAVE2x4_ACC (&acc1, 4);
  979. CO += 8;
  980. AO += temp << 3;
  981. BO += temp << 1;
  982. #if defined(TRMMKERNEL)
  983. REFRESH_AFTER_SAVE (8, 2)
  984. #endif
  985. }
  986. if (m & 4)
  987. {
  988. FLOAT *BO;
  989. v4sf_t *rowC;
  990. v4sf_t result[4];
  991. __vector_quad acc0;
  992. #if defined(TRMMKERNEL)
  993. REFRESH_POINTERS (4, 2)
  994. #else
  995. BO = B;
  996. temp = k;
  997. #endif
  998. BLASLONG l = 0;
  999. FLOAT t[4] = { 0 };
  1000. t[0] = BO[0], t[1] = BO[1];
  1001. vec_t *rowB = (vec_t *) & t[0];
  1002. vec_t *rowA = (vec_t *) & AO[0];
  1003. __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
  1004. for (l = 1; l < temp; l++)
  1005. {
  1006. t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
  1007. rowB = (vec_t *) & t[0];
  1008. rowA = (vec_t *) & AO[l << 2];
  1009. __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
  1010. }
  1011. SAVE2x4_ACC (&acc0, 0);
  1012. CO += 4;
  1013. AO += temp << 2;
  1014. BO += temp << 1;
  1015. #if defined(TRMMKERNEL)
  1016. REFRESH_AFTER_SAVE (4, 2)
  1017. #endif
  1018. }
  1019. if (m & 2)
  1020. {
  1021. FLOAT *BO;
  1022. BLASLONG l = 0;
  1023. #if defined(TRMMKERNEL)
  1024. REFRESH_POINTERS (2, 2)
  1025. #else
  1026. BO = B;
  1027. temp = k;
  1028. #endif
  1029. v4sf_t t = { 0, 0, 0, 0 };
  1030. for (l = 0; l < (temp << 1); l += 2)
  1031. {
  1032. v4sf_t rowA = { AO[l], AO[l], AO[l + 1], AO[l + 1] };
  1033. v4sf_t rowB = { BO[l], BO[l + 1], BO[l], BO[l + 1] };
  1034. t += rowA * rowB;
  1035. }
  1036. t = t * valpha;
  1037. #if defined(TRMMKERNEL)
  1038. CO[0 * ldc] = t[0];
  1039. CO[1 * ldc] = t[1];
  1040. CO[0 * ldc + 1] = t[2];
  1041. CO[1 * ldc + 1] = t[3];
  1042. #else
  1043. CO[0 * ldc] += t[0];
  1044. CO[1 * ldc] += t[1];
  1045. CO[0 * ldc + 1] += t[2];
  1046. CO[1 * ldc + 1] += t[3];
  1047. #endif
  1048. CO += 2;
  1049. AO += temp << 1;
  1050. BO += temp << 1;
  1051. #if defined(TRMMKERNEL)
  1052. REFRESH_AFTER_SAVE (2, 2)
  1053. #endif
  1054. }
  1055. if (m & 1)
  1056. {
  1057. FLOAT *BO;
  1058. BLASLONG l = 0;
  1059. #if defined(TRMMKERNEL)
  1060. REFRESH_POINTERS (1, 2)
  1061. #else
  1062. BO = B;
  1063. temp = k;
  1064. #endif
  1065. v4sf_t t = { 0, 0, 0, 0 };
  1066. for (l = 0; l < temp; l++)
  1067. {
  1068. v4sf_t rowA = { AO[l], AO[l], 0, 0 };
  1069. v4sf_t rowB = { BO[l << 1], BO[(l << 1) + 1], 0, 0 };
  1070. t += rowA * rowB;
  1071. }
  1072. t = t * valpha;
  1073. #if defined(TRMMKERNEL)
  1074. CO[0 * ldc] = t[0];
  1075. CO[1 * ldc] = t[1];
  1076. #else
  1077. CO[0 * ldc] += t[0];
  1078. CO[1 * ldc] += t[1];
  1079. #endif
  1080. CO += 1;
  1081. AO += temp;
  1082. BO += temp << 1;
  1083. #if defined(TRMMKERNEL)
  1084. REFRESH_AFTER_SAVE (1, 2)
  1085. #endif
  1086. }
  1087. #if defined(TRMMKERNEL) && !defined(LEFT)
  1088. off += 2; // number of values in A
  1089. #endif
  1090. B += k << 1;
  1091. }
  1092. if (n & 1)
  1093. {
  1094. BLASLONG i, temp;
  1095. #if defined(TRMMKERNEL) && defined(LEFT)
  1096. off = offset;
  1097. #endif
  1098. FLOAT *CO;
  1099. FLOAT *AO;
  1100. CO = C;
  1101. C += ldc;
  1102. AO = A;
  1103. for (i = 0; i < (m >> 4); i++)
  1104. {
  1105. FLOAT *BO;
  1106. BLASLONG l = 0;
  1107. #if defined(TRMMKERNEL)
  1108. REFRESH_POINTERS (16, 1)
  1109. #else
  1110. BO = B;
  1111. temp = k;
  1112. #endif
  1113. v4sf_t t = { 0, 0, 0, 0 };
  1114. v4sf_t t1 = { 0, 0, 0, 0 };
  1115. v4sf_t t2 = { 0, 0, 0, 0 };
  1116. v4sf_t t3 = { 0, 0, 0, 0 };
  1117. for (l = 0; l < temp; l++)
  1118. {
  1119. v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] };
  1120. v4sf_t rowA = { AO[l << 4], AO[(l << 4) + 1], AO[(l << 4) + 2],
  1121. AO[(l << 4) + 3]
  1122. };
  1123. v4sf_t rowA1 =
  1124. { AO[(l << 4) + 4], AO[(l << 4) + 5], AO[(l << 4) + 6],
  1125. AO[(l << 4) + 7]
  1126. };
  1127. v4sf_t rowA2 =
  1128. { AO[(l << 4) + 8], AO[(l << 4) + 9], AO[(l << 4) + 10],
  1129. AO[(l << 4) + 11]
  1130. };
  1131. v4sf_t rowA3 =
  1132. { AO[(l << 4) + 12], AO[(l << 4) + 13], AO[(l << 4) + 14],
  1133. AO[(l << 4) + 15]
  1134. };
  1135. t += rowA * rowB;
  1136. t1 += rowA1 * rowB;
  1137. t2 += rowA2 * rowB;
  1138. t3 += rowA3 * rowB;
  1139. }
  1140. t = t * valpha;
  1141. t1 = t1 * valpha;
  1142. t2 = t2 * valpha;
  1143. t3 = t3 * valpha;
  1144. #if defined(TRMMKERNEL)
  1145. CO[0] = t[0];
  1146. CO[1] = t[1];
  1147. CO[2] = t[2];
  1148. CO[3] = t[3];
  1149. CO[4] = t1[0];
  1150. CO[5] = t1[1];
  1151. CO[6] = t1[2];
  1152. CO[7] = t1[3];
  1153. CO[8] = t2[0];
  1154. CO[9] = t2[1];
  1155. CO[10] = t2[2];
  1156. CO[11] = t2[3];
  1157. CO[12] = t3[0];
  1158. CO[13] = t3[1];
  1159. CO[14] = t3[2];
  1160. CO[15] = t3[3];
  1161. #else
  1162. CO[0] += t[0];
  1163. CO[1] += t[1];
  1164. CO[2] += t[2];
  1165. CO[3] += t[3];
  1166. CO[4] += t1[0];
  1167. CO[5] += t1[1];
  1168. CO[6] += t1[2];
  1169. CO[7] += t1[3];
  1170. CO[8] += t2[0];
  1171. CO[9] += t2[1];
  1172. CO[10] += t2[2];
  1173. CO[11] += t2[3];
  1174. CO[12] += t3[0];
  1175. CO[13] += t3[1];
  1176. CO[14] += t3[2];
  1177. CO[15] += t3[3];
  1178. #endif
  1179. AO += temp << 4;
  1180. BO += temp;
  1181. CO += 16;
  1182. #if defined(TRMMKERNEL)
  1183. REFRESH_AFTER_SAVE (16, 1)
  1184. #endif
  1185. }
  1186. if (m & 8)
  1187. {
  1188. FLOAT *BO;
  1189. BLASLONG l = 0;
  1190. v4sf_t t = { 0, 0, 0, 0 };
  1191. v4sf_t t1 = { 0, 0, 0, 0 };
  1192. #if defined(TRMMKERNEL)
  1193. REFRESH_POINTERS (8, 1)
  1194. #else
  1195. BO = B;
  1196. temp = k;
  1197. #endif
  1198. for (l = 0; l < temp; l++)
  1199. {
  1200. v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] };
  1201. v4sf_t rowA = { AO[l << 3], AO[(l << 3) + 1], AO[(l << 3) + 2],
  1202. AO[(l << 3) + 3]
  1203. };
  1204. v4sf_t rowA1 =
  1205. { AO[(l << 3) + 4], AO[(l << 3) + 5], AO[(l << 3) + 6],
  1206. AO[(l << 3) + 7]
  1207. };
  1208. t += rowA * rowB;
  1209. t1 += rowA1 * rowB;
  1210. }
  1211. t = t * valpha;
  1212. t1 = t1 * valpha;
  1213. #if defined(TRMMKERNEL)
  1214. CO[0] = t[0];
  1215. CO[1] = t[1];
  1216. CO[2] = t[2];
  1217. CO[3] = t[3];
  1218. CO[4] = t1[0];
  1219. CO[5] = t1[1];
  1220. CO[6] = t1[2];
  1221. CO[7] = t1[3];
  1222. #else
  1223. CO[0] += t[0];
  1224. CO[1] += t[1];
  1225. CO[2] += t[2];
  1226. CO[3] += t[3];
  1227. CO[4] += t1[0];
  1228. CO[5] += t1[1];
  1229. CO[6] += t1[2];
  1230. CO[7] += t1[3];
  1231. #endif
  1232. AO += temp << 3;
  1233. BO += temp;
  1234. CO += 8;
  1235. #if defined(TRMMKERNEL)
  1236. REFRESH_AFTER_SAVE (8, 1)
  1237. #endif
  1238. }
  1239. if (m & 4)
  1240. {
  1241. FLOAT *BO;
  1242. BLASLONG l = 0;
  1243. v4sf_t t = { 0, 0, 0, 0 };
  1244. #if defined(TRMMKERNEL)
  1245. REFRESH_POINTERS (4, 1)
  1246. #else
  1247. BO = B;
  1248. temp = k;
  1249. #endif
  1250. for (l = 0; l < temp; l++)
  1251. {
  1252. v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] };
  1253. v4sf_t rowA = { AO[l << 2], AO[(l << 2) + 1], AO[(l << 2) + 2],
  1254. AO[(l << 2) + 3]
  1255. };
  1256. t += rowA * rowB;
  1257. }
  1258. t = t * valpha;
  1259. #if defined(TRMMKERNEL)
  1260. CO[0] = t[0];
  1261. CO[1] = t[1];
  1262. CO[2] = t[2];
  1263. CO[3] = t[3];
  1264. #else
  1265. CO[0] += t[0];
  1266. CO[1] += t[1];
  1267. CO[2] += t[2];
  1268. CO[3] += t[3];
  1269. #endif
  1270. AO += temp << 2;
  1271. BO += temp;
  1272. CO += 4;
  1273. #if defined(TRMMKERNEL)
  1274. REFRESH_AFTER_SAVE (4, 1)
  1275. #endif
  1276. }
  1277. if (m & 2)
  1278. {
  1279. FLOAT *BO;
  1280. BLASLONG l = 0;
  1281. #if defined(TRMMKERNEL)
  1282. REFRESH_POINTERS (2, 1)
  1283. #else
  1284. BO = B;
  1285. temp = k;
  1286. #endif
  1287. v4sf_t t = { 0, 0, 0, 0 };
  1288. for (l = 0; l < temp; l++)
  1289. {
  1290. v4sf_t rowB = { BO[l], BO[l], 0, 0 };
  1291. v4sf_t rowA = { AO[l << 1], AO[(l << 1) + 1], 0, 0 };
  1292. t += rowA * rowB;
  1293. }
  1294. t = t * valpha;
  1295. #if defined(TRMMKERNEL)
  1296. CO[0] = t[0];
  1297. CO[1] = t[1];
  1298. #else
  1299. CO[0] += t[0];
  1300. CO[1] += t[1];
  1301. #endif
  1302. AO += temp << 1;
  1303. BO += temp;
  1304. CO += 2;
  1305. #if defined(TRMMKERNEL)
  1306. REFRESH_AFTER_SAVE (2, 1)
  1307. #endif
  1308. }
  1309. if (m & 1)
  1310. {
  1311. FLOAT *BO;
  1312. #if defined(TRMMKERNEL)
  1313. REFRESH_POINTERS (1, 1)
  1314. #else
  1315. BO = B;
  1316. temp = k;
  1317. #endif
  1318. BLASLONG l = 0;
  1319. FLOAT t = 0;
  1320. for (l = 0; l < temp; l++)
  1321. {
  1322. t += AO[l] * BO[l];
  1323. }
  1324. AO += temp;
  1325. BO += temp;
  1326. #if defined(TRMMKERNEL)
  1327. CO[0] = t * alpha;
  1328. #else
  1329. CO[0] += t * alpha;
  1330. #endif
  1331. CO += 1;
  1332. #if defined(TRMMKERNEL)
  1333. REFRESH_AFTER_SAVE (1, 1)
  1334. #endif
  1335. }
  1336. #if defined(TRMMKERNEL) && !defined(LEFT)
  1337. off += 1; // number of values in A
  1338. #endif
  1339. B += k;
  1340. }
  1341. return 0;
  1342. }