You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

trmmkernel_16x2.c 18 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153
  #include "common.h"
  2. int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
  3. {
  4. BLASLONG i,j,k;
  5. FLOAT *C0,*C1,*ptrba,*ptrbb;
  6. FLOAT res0_0;
  7. FLOAT res0_1;
  8. FLOAT res0_2;
  9. FLOAT res0_3;
  10. FLOAT res0_4;
  11. FLOAT res0_5;
  12. FLOAT res0_6;
  13. FLOAT res0_7;
  14. FLOAT res0_8;
  15. FLOAT res0_9;
  16. FLOAT res0_10;
  17. FLOAT res0_11;
  18. FLOAT res0_12;
  19. FLOAT res0_13;
  20. FLOAT res0_14;
  21. FLOAT res0_15;
  22. FLOAT res1_0;
  23. FLOAT res1_1;
  24. FLOAT res1_2;
  25. FLOAT res1_3;
  26. FLOAT res1_4;
  27. FLOAT res1_5;
  28. FLOAT res1_6;
  29. FLOAT res1_7;
  30. FLOAT res1_8;
  31. FLOAT res1_9;
  32. FLOAT res1_10;
  33. FLOAT res1_11;
  34. FLOAT res1_12;
  35. FLOAT res1_13;
  36. FLOAT res1_14;
  37. FLOAT res1_15;
  38. FLOAT a0;
  39. FLOAT a1;
  40. FLOAT b0;
  41. FLOAT b1;
  42. BLASLONG off, temp;
  43. #if !defined(LEFT)
  44. off = -offset;
  45. #else
  46. off = 0;
  47. #endif
  48. for (j=0; j<bn/2; j+=1)
  49. {
  50. C0 = C;
  51. C1 = C0+ldc;
  52. #if defined(TRMMKERNEL) && defined(LEFT)
  53. off = offset;
  54. #endif
  55. ptrba = ba;
  56. for (i=0; i<bm/16; i+=1)
  57. {
  58. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  59. ptrbb = bb;
  60. #else
  61. ptrba += off*16;
  62. ptrbb = bb + off*2;
  63. #endif
  64. res0_0 = 0;
  65. res0_1 = 0;
  66. res0_2 = 0;
  67. res0_3 = 0;
  68. res0_4 = 0;
  69. res0_5 = 0;
  70. res0_6 = 0;
  71. res0_7 = 0;
  72. res0_8 = 0;
  73. res0_9 = 0;
  74. res0_10 = 0;
  75. res0_11 = 0;
  76. res0_12 = 0;
  77. res0_13 = 0;
  78. res0_14 = 0;
  79. res0_15 = 0;
  80. res1_0 = 0;
  81. res1_1 = 0;
  82. res1_2 = 0;
  83. res1_3 = 0;
  84. res1_4 = 0;
  85. res1_5 = 0;
  86. res1_6 = 0;
  87. res1_7 = 0;
  88. res1_8 = 0;
  89. res1_9 = 0;
  90. res1_10 = 0;
  91. res1_11 = 0;
  92. res1_12 = 0;
  93. res1_13 = 0;
  94. res1_14 = 0;
  95. res1_15 = 0;
  96. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  97. temp = bk-off;
  98. #elif defined(LEFT)
  99. temp = off+16; // number of values in A
  100. #else
  101. temp = off+2; // number of values in B
  102. #endif
  103. for (k=0; k<temp; k++)
  104. {
  105. b0 = ptrbb[0];
  106. b1 = ptrbb[1];
  107. a0 = ptrba[0];
  108. res0_0 += a0*b0;
  109. res1_0 += a0*b1;
  110. a1 = ptrba[1];
  111. res0_1 += a1*b0;
  112. res1_1 += a1*b1;
  113. a0 = ptrba[2];
  114. res0_2 += a0*b0;
  115. res1_2 += a0*b1;
  116. a1 = ptrba[3];
  117. res0_3 += a1*b0;
  118. res1_3 += a1*b1;
  119. a0 = ptrba[4];
  120. res0_4 += a0*b0;
  121. res1_4 += a0*b1;
  122. a1 = ptrba[5];
  123. res0_5 += a1*b0;
  124. res1_5 += a1*b1;
  125. a0 = ptrba[6];
  126. res0_6 += a0*b0;
  127. res1_6 += a0*b1;
  128. a1 = ptrba[7];
  129. res0_7 += a1*b0;
  130. res1_7 += a1*b1;
  131. a0 = ptrba[8];
  132. res0_8 += a0*b0;
  133. res1_8 += a0*b1;
  134. a1 = ptrba[9];
  135. res0_9 += a1*b0;
  136. res1_9 += a1*b1;
  137. a0 = ptrba[10];
  138. res0_10 += a0*b0;
  139. res1_10 += a0*b1;
  140. a1 = ptrba[11];
  141. res0_11 += a1*b0;
  142. res1_11 += a1*b1;
  143. a0 = ptrba[12];
  144. res0_12 += a0*b0;
  145. res1_12 += a0*b1;
  146. a1 = ptrba[13];
  147. res0_13 += a1*b0;
  148. res1_13 += a1*b1;
  149. a0 = ptrba[14];
  150. res0_14 += a0*b0;
  151. res1_14 += a0*b1;
  152. a1 = ptrba[15];
  153. res0_15 += a1*b0;
  154. res1_15 += a1*b1;
  155. ptrba = ptrba+16;
  156. ptrbb = ptrbb+2;
  157. }
  158. res0_0 *= alpha;
  159. res0_1 *= alpha;
  160. res0_2 *= alpha;
  161. res0_3 *= alpha;
  162. res0_4 *= alpha;
  163. res0_5 *= alpha;
  164. res0_6 *= alpha;
  165. res0_7 *= alpha;
  166. res0_8 *= alpha;
  167. res0_9 *= alpha;
  168. res0_10 *= alpha;
  169. res0_11 *= alpha;
  170. res0_12 *= alpha;
  171. res0_13 *= alpha;
  172. res0_14 *= alpha;
  173. res0_15 *= alpha;
  174. res1_0 *= alpha;
  175. res1_1 *= alpha;
  176. res1_2 *= alpha;
  177. res1_3 *= alpha;
  178. res1_4 *= alpha;
  179. res1_5 *= alpha;
  180. res1_6 *= alpha;
  181. res1_7 *= alpha;
  182. res1_8 *= alpha;
  183. res1_9 *= alpha;
  184. res1_10 *= alpha;
  185. res1_11 *= alpha;
  186. res1_12 *= alpha;
  187. res1_13 *= alpha;
  188. res1_14 *= alpha;
  189. res1_15 *= alpha;
  190. C0[0] = res0_0;
  191. C0[1] = res0_1;
  192. C0[2] = res0_2;
  193. C0[3] = res0_3;
  194. C0[4] = res0_4;
  195. C0[5] = res0_5;
  196. C0[6] = res0_6;
  197. C0[7] = res0_7;
  198. C0[8] = res0_8;
  199. C0[9] = res0_9;
  200. C0[10] = res0_10;
  201. C0[11] = res0_11;
  202. C0[12] = res0_12;
  203. C0[13] = res0_13;
  204. C0[14] = res0_14;
  205. C0[15] = res0_15;
  206. C1[0] = res1_0;
  207. C1[1] = res1_1;
  208. C1[2] = res1_2;
  209. C1[3] = res1_3;
  210. C1[4] = res1_4;
  211. C1[5] = res1_5;
  212. C1[6] = res1_6;
  213. C1[7] = res1_7;
  214. C1[8] = res1_8;
  215. C1[9] = res1_9;
  216. C1[10] = res1_10;
  217. C1[11] = res1_11;
  218. C1[12] = res1_12;
  219. C1[13] = res1_13;
  220. C1[14] = res1_14;
  221. C1[15] = res1_15;
  222. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  223. temp = bk - off;
  224. #ifdef LEFT
  225. temp -= 16; // number of values in A
  226. #else
  227. temp -= 2; // number of values in B
  228. #endif
  229. ptrba += temp*16;
  230. ptrbb += temp*2;
  231. #endif
  232. #ifdef LEFT
  233. off += 16; // number of values in A
  234. #endif
  235. C0 = C0+16;
  236. C1 = C1+16;
  237. }
  238. if ( bm & 8)
  239. {
  240. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  241. ptrbb = bb;
  242. #else
  243. ptrba += off*8;
  244. ptrbb = bb + off*2;
  245. #endif
  246. res0_0 = 0;
  247. res0_1 = 0;
  248. res0_2 = 0;
  249. res0_3 = 0;
  250. res0_4 = 0;
  251. res0_5 = 0;
  252. res0_6 = 0;
  253. res0_7 = 0;
  254. res1_0 = 0;
  255. res1_1 = 0;
  256. res1_2 = 0;
  257. res1_3 = 0;
  258. res1_4 = 0;
  259. res1_5 = 0;
  260. res1_6 = 0;
  261. res1_7 = 0;
  262. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  263. temp = bk-off;
  264. #elif defined(LEFT)
  265. temp = off+8; // number of values in A
  266. #else
  267. temp = off+2; // number of values in B
  268. #endif
  269. for (k=0; k<temp; k++)
  270. {
  271. b0 = ptrbb[0];
  272. b1 = ptrbb[1];
  273. a0 = ptrba[0];
  274. res0_0 += a0*b0;
  275. res1_0 += a0*b1;
  276. a1 = ptrba[1];
  277. res0_1 += a1*b0;
  278. res1_1 += a1*b1;
  279. a0 = ptrba[2];
  280. res0_2 += a0*b0;
  281. res1_2 += a0*b1;
  282. a1 = ptrba[3];
  283. res0_3 += a1*b0;
  284. res1_3 += a1*b1;
  285. a0 = ptrba[4];
  286. res0_4 += a0*b0;
  287. res1_4 += a0*b1;
  288. a1 = ptrba[5];
  289. res0_5 += a1*b0;
  290. res1_5 += a1*b1;
  291. a0 = ptrba[6];
  292. res0_6 += a0*b0;
  293. res1_6 += a0*b1;
  294. a1 = ptrba[7];
  295. res0_7 += a1*b0;
  296. res1_7 += a1*b1;
  297. ptrba = ptrba+8;
  298. ptrbb = ptrbb+2;
  299. }
  300. res0_0 *= alpha;
  301. res0_1 *= alpha;
  302. res0_2 *= alpha;
  303. res0_3 *= alpha;
  304. res0_4 *= alpha;
  305. res0_5 *= alpha;
  306. res0_6 *= alpha;
  307. res0_7 *= alpha;
  308. res1_0 *= alpha;
  309. res1_1 *= alpha;
  310. res1_2 *= alpha;
  311. res1_3 *= alpha;
  312. res1_4 *= alpha;
  313. res1_5 *= alpha;
  314. res1_6 *= alpha;
  315. res1_7 *= alpha;
  316. C0[0] = res0_0;
  317. C0[1] = res0_1;
  318. C0[2] = res0_2;
  319. C0[3] = res0_3;
  320. C0[4] = res0_4;
  321. C0[5] = res0_5;
  322. C0[6] = res0_6;
  323. C0[7] = res0_7;
  324. C1[0] = res1_0;
  325. C1[1] = res1_1;
  326. C1[2] = res1_2;
  327. C1[3] = res1_3;
  328. C1[4] = res1_4;
  329. C1[5] = res1_5;
  330. C1[6] = res1_6;
  331. C1[7] = res1_7;
  332. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  333. temp = bk - off;
  334. #ifdef LEFT
  335. temp -= 8; // number of values in A
  336. #else
  337. temp -= 2; // number of values in B
  338. #endif
  339. ptrba += temp*8;
  340. ptrbb += temp*2;
  341. #endif
  342. #ifdef LEFT
  343. off += 8; // number of values in A
  344. #endif
  345. C0 = C0+8;
  346. C1 = C1+8;
  347. }
  348. if ( bm & 4 )
  349. {
  350. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  351. ptrbb = bb;
  352. #else
  353. ptrba += off*4;
  354. ptrbb = bb + off*2;
  355. #endif
  356. res0_0 = 0;
  357. res0_1 = 0;
  358. res0_2 = 0;
  359. res0_3 = 0;
  360. res1_0 = 0;
  361. res1_1 = 0;
  362. res1_2 = 0;
  363. res1_3 = 0;
  364. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  365. temp = bk-off;
  366. #elif defined(LEFT)
  367. temp = off+4; // number of values in A
  368. #else
  369. temp = off+2; // number of values in B
  370. #endif
  371. for (k=0; k<temp; k++)
  372. {
  373. b0 = ptrbb[0];
  374. b1 = ptrbb[1];
  375. a0 = ptrba[0];
  376. res0_0 += a0*b0;
  377. res1_0 += a0*b1;
  378. a1 = ptrba[1];
  379. res0_1 += a1*b0;
  380. res1_1 += a1*b1;
  381. a0 = ptrba[2];
  382. res0_2 += a0*b0;
  383. res1_2 += a0*b1;
  384. a1 = ptrba[3];
  385. res0_3 += a1*b0;
  386. res1_3 += a1*b1;
  387. ptrba = ptrba+4;
  388. ptrbb = ptrbb+2;
  389. }
  390. res0_0 *= alpha;
  391. res0_1 *= alpha;
  392. res0_2 *= alpha;
  393. res0_3 *= alpha;
  394. res1_0 *= alpha;
  395. res1_1 *= alpha;
  396. res1_2 *= alpha;
  397. res1_3 *= alpha;
  398. C0[0] = res0_0;
  399. C0[1] = res0_1;
  400. C0[2] = res0_2;
  401. C0[3] = res0_3;
  402. C1[0] = res1_0;
  403. C1[1] = res1_1;
  404. C1[2] = res1_2;
  405. C1[3] = res1_3;
  406. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  407. temp = bk - off;
  408. #ifdef LEFT
  409. temp -= 4; // number of values in A
  410. #else
  411. temp -= 2; // number of values in B
  412. #endif
  413. ptrba += temp*4;
  414. ptrbb += temp*2;
  415. #endif
  416. #ifdef LEFT
  417. off += 4; // number of values in A
  418. #endif
  419. C0 = C0+4;
  420. C1 = C1+4;
  421. }
  422. if ( bm & 2 )
  423. {
  424. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  425. ptrbb = bb;
  426. #else
  427. ptrba += off*2;
  428. ptrbb = bb + off*2;
  429. #endif
  430. res0_0 = 0;
  431. res0_1 = 0;
  432. res1_0 = 0;
  433. res1_1 = 0;
  434. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  435. temp = bk-off;
  436. #elif defined(LEFT)
  437. temp = off+2; // number of values in A
  438. #else
  439. temp = off+2; // number of values in B
  440. #endif
  441. for (k=0; k<temp; k++)
  442. {
  443. b0 = ptrbb[0];
  444. b1 = ptrbb[1];
  445. a0 = ptrba[0];
  446. res0_0 += a0*b0;
  447. res1_0 += a0*b1;
  448. a1 = ptrba[1];
  449. res0_1 += a1*b0;
  450. res1_1 += a1*b1;
  451. ptrba = ptrba+2;
  452. ptrbb = ptrbb+2;
  453. }
  454. res0_0 *= alpha;
  455. res0_1 *= alpha;
  456. res1_0 *= alpha;
  457. res1_1 *= alpha;
  458. C0[0] = res0_0;
  459. C0[1] = res0_1;
  460. C1[0] = res1_0;
  461. C1[1] = res1_1;
  462. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  463. temp = bk - off;
  464. #ifdef LEFT
  465. temp -= 2; // number of values in A
  466. #else
  467. temp -= 2; // number of values in B
  468. #endif
  469. ptrba += temp*2;
  470. ptrbb += temp*2;
  471. #endif
  472. #ifdef LEFT
  473. off += 2; // number of values in A
  474. #endif
  475. C0 = C0+2;
  476. C1 = C1+2;
  477. }
  478. if ( bm & 1 )
  479. {
  480. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  481. ptrbb = bb;
  482. #else
  483. ptrba += off*1;
  484. ptrbb = bb + off*2;
  485. #endif
  486. res0_0 = 0;
  487. res1_0 = 0;
  488. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  489. temp = bk-off;
  490. #elif defined(LEFT)
  491. temp = off+1; // number of values in A
  492. #else
  493. temp = off+2; // number of values in B
  494. #endif
  495. for (k=0; k<temp; k++)
  496. {
  497. b0 = ptrbb[0];
  498. b1 = ptrbb[1];
  499. a0 = ptrba[0];
  500. res0_0 += a0*b0;
  501. res1_0 += a0*b1;
  502. ptrba = ptrba+1;
  503. ptrbb = ptrbb+2;
  504. }
  505. res0_0 *= alpha;
  506. res1_0 *= alpha;
  507. C0[0] = res0_0;
  508. C1[0] = res1_0;
  509. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  510. temp = bk - off;
  511. #ifdef LEFT
  512. temp -= 1; // number of values in A
  513. #else
  514. temp -= 2; // number of values in B
  515. #endif
  516. ptrba += temp*1;
  517. ptrbb += temp*2;
  518. #endif
  519. #ifdef LEFT
  520. off += 1; // number of values in A
  521. #endif
  522. C0 = C0+1;
  523. C1 = C1+1;
  524. }
  525. #if defined(TRMMKERNEL) && !defined(LEFT)
  526. off += 2;
  527. #endif
  528. k = (bk<<1);
  529. bb = bb+k;
  530. i = (ldc<<1);
  531. C = C+i;
  532. }
  533. for (j=0; j<(bn&1); j+=1)
  534. {
  535. C0 = C;
  536. #if defined(TRMMKERNEL) && defined(LEFT)
  537. off = offset;
  538. #endif
  539. ptrba = ba;
  540. for (i=0; i<bm/16; i+=1)
  541. {
  542. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  543. ptrbb = bb;
  544. #else
  545. ptrba += off*16;
  546. ptrbb = bb + off*1;
  547. #endif
  548. res0_0 = 0;
  549. res0_1 = 0;
  550. res0_2 = 0;
  551. res0_3 = 0;
  552. res0_4 = 0;
  553. res0_5 = 0;
  554. res0_6 = 0;
  555. res0_7 = 0;
  556. res0_8 = 0;
  557. res0_9 = 0;
  558. res0_10 = 0;
  559. res0_11 = 0;
  560. res0_12 = 0;
  561. res0_13 = 0;
  562. res0_14 = 0;
  563. res0_15 = 0;
  564. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  565. temp = bk-off;
  566. #elif defined(LEFT)
  567. temp = off+16; // number of values in A
  568. #else
  569. temp = off+1; // number of values in B
  570. #endif
  571. for (k=0; k<temp; k++)
  572. {
  573. b0 = ptrbb[0];
  574. a0 = ptrba[0];
  575. res0_0 += a0*b0;
  576. a1 = ptrba[1];
  577. res0_1 += a1*b0;
  578. a0 = ptrba[2];
  579. res0_2 += a0*b0;
  580. a1 = ptrba[3];
  581. res0_3 += a1*b0;
  582. a0 = ptrba[4];
  583. res0_4 += a0*b0;
  584. a1 = ptrba[5];
  585. res0_5 += a1*b0;
  586. a0 = ptrba[6];
  587. res0_6 += a0*b0;
  588. a1 = ptrba[7];
  589. res0_7 += a1*b0;
  590. a0 = ptrba[8];
  591. res0_8 += a0*b0;
  592. a1 = ptrba[9];
  593. res0_9 += a1*b0;
  594. a0 = ptrba[10];
  595. res0_10 += a0*b0;
  596. a1 = ptrba[11];
  597. res0_11 += a1*b0;
  598. a0 = ptrba[12];
  599. res0_12 += a0*b0;
  600. a1 = ptrba[13];
  601. res0_13 += a1*b0;
  602. a0 = ptrba[14];
  603. res0_14 += a0*b0;
  604. a1 = ptrba[15];
  605. res0_15 += a1*b0;
  606. ptrba = ptrba+16;
  607. ptrbb = ptrbb+1;
  608. }
  609. res0_0 *= alpha;
  610. res0_1 *= alpha;
  611. res0_2 *= alpha;
  612. res0_3 *= alpha;
  613. res0_4 *= alpha;
  614. res0_5 *= alpha;
  615. res0_6 *= alpha;
  616. res0_7 *= alpha;
  617. res0_8 *= alpha;
  618. res0_9 *= alpha;
  619. res0_10 *= alpha;
  620. res0_11 *= alpha;
  621. res0_12 *= alpha;
  622. res0_13 *= alpha;
  623. res0_14 *= alpha;
  624. res0_15 *= alpha;
  625. C0[0] = res0_0;
  626. C0[1] = res0_1;
  627. C0[2] = res0_2;
  628. C0[3] = res0_3;
  629. C0[4] = res0_4;
  630. C0[5] = res0_5;
  631. C0[6] = res0_6;
  632. C0[7] = res0_7;
  633. C0[8] = res0_8;
  634. C0[9] = res0_9;
  635. C0[10] = res0_10;
  636. C0[11] = res0_11;
  637. C0[12] = res0_12;
  638. C0[13] = res0_13;
  639. C0[14] = res0_14;
  640. C0[15] = res0_15;
  641. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  642. temp = bk - off;
  643. #ifdef LEFT
  644. temp -= 16; // number of values in A
  645. #else
  646. temp -= 1; // number of values in B
  647. #endif
  648. ptrba += temp*16;
  649. ptrbb += temp*1;
  650. #endif
  651. #ifdef LEFT
  652. off += 16; // number of values in A
  653. #endif
  654. C0 = C0+16;
  655. }
  656. if ( bm & 8 )
  657. {
  658. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  659. ptrbb = bb;
  660. #else
  661. ptrba += off*8;
  662. ptrbb = bb + off*1;
  663. #endif
  664. res0_0 = 0;
  665. res0_1 = 0;
  666. res0_2 = 0;
  667. res0_3 = 0;
  668. res0_4 = 0;
  669. res0_5 = 0;
  670. res0_6 = 0;
  671. res0_7 = 0;
  672. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  673. temp = bk-off;
  674. #elif defined(LEFT)
  675. temp = off+8; // number of values in A
  676. #else
  677. temp = off+1; // number of values in B
  678. #endif
  679. for (k=0; k<temp; k++)
  680. {
  681. b0 = ptrbb[0];
  682. a0 = ptrba[0];
  683. res0_0 += a0*b0;
  684. a1 = ptrba[1];
  685. res0_1 += a1*b0;
  686. a0 = ptrba[2];
  687. res0_2 += a0*b0;
  688. a1 = ptrba[3];
  689. res0_3 += a1*b0;
  690. a0 = ptrba[4];
  691. res0_4 += a0*b0;
  692. a1 = ptrba[5];
  693. res0_5 += a1*b0;
  694. a0 = ptrba[6];
  695. res0_6 += a0*b0;
  696. a1 = ptrba[7];
  697. res0_7 += a1*b0;
  698. ptrba = ptrba+8;
  699. ptrbb = ptrbb+1;
  700. }
  701. res0_0 *= alpha;
  702. res0_1 *= alpha;
  703. res0_2 *= alpha;
  704. res0_3 *= alpha;
  705. res0_4 *= alpha;
  706. res0_5 *= alpha;
  707. res0_6 *= alpha;
  708. res0_7 *= alpha;
  709. C0[0] = res0_0;
  710. C0[1] = res0_1;
  711. C0[2] = res0_2;
  712. C0[3] = res0_3;
  713. C0[4] = res0_4;
  714. C0[5] = res0_5;
  715. C0[6] = res0_6;
  716. C0[7] = res0_7;
  717. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  718. temp = bk - off;
  719. #ifdef LEFT
  720. temp -= 8; // number of values in A
  721. #else
  722. temp -= 1; // number of values in B
  723. #endif
  724. ptrba += temp*8;
  725. ptrbb += temp*1;
  726. #endif
  727. #ifdef LEFT
  728. off += 8; // number of values in A
  729. #endif
  730. C0 = C0+8;
  731. }
  732. if ( bm & 4 )
  733. {
  734. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  735. ptrbb = bb;
  736. #else
  737. ptrba += off*4;
  738. ptrbb = bb + off*1;
  739. #endif
  740. res0_0 = 0;
  741. res0_1 = 0;
  742. res0_2 = 0;
  743. res0_3 = 0;
  744. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  745. temp = bk-off;
  746. #elif defined(LEFT)
  747. temp = off+4; // number of values in A
  748. #else
  749. temp = off+1; // number of values in B
  750. #endif
  751. for (k=0; k<temp; k++)
  752. {
  753. b0 = ptrbb[0];
  754. a0 = ptrba[0];
  755. res0_0 += a0*b0;
  756. a1 = ptrba[1];
  757. res0_1 += a1*b0;
  758. a0 = ptrba[2];
  759. res0_2 += a0*b0;
  760. a1 = ptrba[3];
  761. res0_3 += a1*b0;
  762. ptrba = ptrba+4;
  763. ptrbb = ptrbb+1;
  764. }
  765. res0_0 *= alpha;
  766. res0_1 *= alpha;
  767. res0_2 *= alpha;
  768. res0_3 *= alpha;
  769. C0[0] = res0_0;
  770. C0[1] = res0_1;
  771. C0[2] = res0_2;
  772. C0[3] = res0_3;
  773. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  774. temp = bk - off;
  775. #ifdef LEFT
  776. temp -= 4; // number of values in A
  777. #else
  778. temp -= 1; // number of values in B
  779. #endif
  780. ptrba += temp*4;
  781. ptrbb += temp*1;
  782. #endif
  783. #ifdef LEFT
  784. off += 4; // number of values in A
  785. #endif
  786. C0 = C0+4;
  787. }
  788. if ( bm & 2 )
  789. {
  790. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  791. ptrbb = bb;
  792. #else
  793. ptrba += off*2;
  794. ptrbb = bb + off*1;
  795. #endif
  796. res0_0 = 0;
  797. res0_1 = 0;
  798. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  799. temp = bk-off;
  800. #elif defined(LEFT)
  801. temp = off+2; // number of values in A
  802. #else
  803. temp = off+1; // number of values in B
  804. #endif
  805. for (k=0; k<temp; k++)
  806. {
  807. b0 = ptrbb[0];
  808. a0 = ptrba[0];
  809. res0_0 += a0*b0;
  810. a1 = ptrba[1];
  811. res0_1 += a1*b0;
  812. ptrba = ptrba+2;
  813. ptrbb = ptrbb+1;
  814. }
  815. res0_0 *= alpha;
  816. res0_1 *= alpha;
  817. C0[0] = res0_0;
  818. C0[1] = res0_1;
  819. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  820. temp = bk - off;
  821. #ifdef LEFT
  822. temp -= 2; // number of values in A
  823. #else
  824. temp -= 1; // number of values in B
  825. #endif
  826. ptrba += temp*2;
  827. ptrbb += temp*1;
  828. #endif
  829. #ifdef LEFT
  830. off += 2; // number of values in A
  831. #endif
  832. C0 = C0+2;
  833. }
  834. if ( bm & 1 )
  835. {
  836. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  837. ptrbb = bb;
  838. #else
  839. ptrba += off*1;
  840. ptrbb = bb + off*1;
  841. #endif
  842. res0_0 = 0;
  843. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  844. temp = bk-off;
  845. #elif defined(LEFT)
  846. temp = off+1; // number of values in A
  847. #else
  848. temp = off+1; // number of values in B
  849. #endif
  850. for (k=0; k<temp; k++)
  851. {
  852. b0 = ptrbb[0];
  853. a0 = ptrba[0];
  854. res0_0 += a0*b0;
  855. ptrba = ptrba+1;
  856. ptrbb = ptrbb+1;
  857. }
  858. res0_0 *= alpha;
  859. C0[0] = res0_0;
  860. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  861. temp = bk - off;
  862. #ifdef LEFT
  863. temp -= 1; // number of values in A
  864. #else
  865. temp -= 1; // number of values in B
  866. #endif
  867. ptrba += temp*1;
  868. ptrbb += temp*1;
  869. #endif
  870. #ifdef LEFT
  871. off += 1; // number of values in A
  872. #endif
  873. C0 = C0+1;
  874. }
  875. #if defined(TRMMKERNEL) && !defined(LEFT)
  876. off += 1;
  877. #endif
  878. k = (bk<<0);
  879. bb = bb+k;
  880. C = C+ldc;
  881. }
  882. return 0;
  883. }