You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

trmmkernel_16x2.c 18 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151
  1. #include "common.h"
  /* Largest tile handled at once: 16 values of A by 2 values of B per k step. */
  enum { TRMM_TILE_MAX_ROWS = 16, TRMM_TILE_MAX_COLS = 2 };

  /*
   * Compute one rows x cols tile of the result and store it to C.
   *
   * For every k step the tile reads `rows` consecutive values from the A
   * stream and `cols` consecutive values from the B stream, accumulating
   * acc[n][r] += a[r] * b[n].  Each sum is scaled by alpha once, after the
   * k loop, and written to c[n * ldc + r] (C is overwritten, never read).
   *
   * rows   tile height, at most TRMM_TILE_MAX_ROWS
   * cols   tile width, at most TRMM_TILE_MAX_COLS
   * bk     total number of k steps stored per tile in the A/B streams
   * ptrba  current position in the A stream (start of this tile's data)
   * bb     start of the current B column panel
   * off    current triangular offset; selects which k steps are used
   *
   * Returns the A-stream position of the next tile.
   */
  static FLOAT *trmm_tile(BLASLONG rows, BLASLONG cols, BLASLONG bk, FLOAT alpha,
                          FLOAT *ptrba, const FLOAT *bb, FLOAT *c, BLASLONG ldc,
                          BLASLONG off)
  {
    FLOAT acc[TRMM_TILE_MAX_COLS][TRMM_TILE_MAX_ROWS];
    const FLOAT *ptrbb;
    BLASLONG depth; // number of k steps actually multiplied for this tile
    BLASLONG r, n, k;

  #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
    // Leading part of the k range: start at k = 0 and stop early.
    ptrbb = bb;
  #ifdef LEFT
    depth = off + rows; // number of values in A
  #else
    depth = off + cols; // number of values in B
  #endif
  #else
    // Trailing part of the k range: skip the first `off` steps of both streams.
    ptrba += off * rows;
    ptrbb = bb + off * cols;
    depth = bk - off;
  #endif

    for (n = 0; n < cols; n++) {
      for (r = 0; r < rows; r++) {
        acc[n][r] = 0;
      }
    }

    for (k = 0; k < depth; k++) {
      for (n = 0; n < cols; n++) {
        const FLOAT b = ptrbb[n];
        for (r = 0; r < rows; r++) {
          acc[n][r] += ptrba[r] * b;
        }
      }
      ptrba += rows;
      ptrbb += cols;
    }

    for (n = 0; n < cols; n++) {
      for (r = 0; r < rows; r++) {
        c[n * ldc + r] = acc[n][r] * alpha;
      }
    }

  #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
    // Step over the k steps this tile did not consume so that the returned
    // pointer lands on the next tile's data.
    ptrba += (bk - depth) * rows;
  #endif
    return ptrba;
  }

  /*
   * Process one column panel of width `cols` over all bm rows: as many
   * 16-row tiles as fit, then one tile each of 8, 4, 2 and 1 rows for the
   * bits set in bm.  `off` is taken by value; when LEFT is defined it grows
   * by the tile height after each tile.
   */
  static void trmm_panel(BLASLONG cols, BLASLONG bm, BLASLONG bk, FLOAT alpha,
                         FLOAT *ba, const FLOAT *bb, FLOAT *c, BLASLONG ldc,
                         BLASLONG off)
  {
    FLOAT *ptrba = ba;
    BLASLONG rows, i;

    for (i = 0; i < bm / TRMM_TILE_MAX_ROWS; i++) {
      ptrba = trmm_tile(TRMM_TILE_MAX_ROWS, cols, bk, alpha, ptrba, bb, c, ldc, off);
  #ifdef LEFT
      off += TRMM_TILE_MAX_ROWS; // number of values in A
  #endif
      c += TRMM_TILE_MAX_ROWS;
    }

    // Remainder rows, largest tile first: 8, 4, 2, 1.
    for (rows = TRMM_TILE_MAX_ROWS / 2; rows > 0; rows /= 2) {
      if (bm & rows) {
        ptrba = trmm_tile(rows, cols, bk, alpha, ptrba, bb, c, ldc, off);
  #ifdef LEFT
        off += rows; // number of values in A
  #endif
        c += rows;
      }
    }
  }

  /*
   * TRMM micro-kernel with 16x2 blocking.
   *
   * Walks C in panels of two columns (plus one single-column panel when bn
   * is odd) and, inside each panel, in tiles of 16/8/4/2/1 rows.  Every tile
   * is overwritten with alpha * sum_k a * b over the k range selected by the
   * LEFT / TRANSA / TRMMKERNEL build macros and `offset`.
   *
   * bm, bn  number of rows / columns of C to produce
   * bk      number of k steps stored per tile in ba and bb
   * alpha   scale factor applied to every result
   * ba      A stream, read tile after tile (rows values per k step)
   * bb      B stream, read panel after panel (cols values per k step)
   * C, ldc  output, element (r, n) is stored at C[n * ldc + r]
   * offset  triangular offset supplied by the caller
   *
   * Always returns 0.
   */
  int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, FLOAT *ba, FLOAT *bb, FLOAT *C, BLASLONG ldc, BLASLONG offset)
  {
    // Defined starting value: previously `off` was read uninitialized when
    // LEFT was defined without TRMMKERNEL.
    BLASLONG off = 0;
    BLASLONG j;

  #if !defined(LEFT)
    off = -offset;
  #elif defined(TRMMKERNEL)
    // Every panel restarts from `offset`; trmm_panel advances a private copy.
    off = offset;
  #endif

    for (j = 0; j < bn / 2; j++) {
      trmm_panel(2, bm, bk, alpha, ba, bb, C, ldc, off);
  #if defined(TRMMKERNEL) && !defined(LEFT)
      off += 2;
  #endif
      bb += bk * 2;
      C += ldc * 2;
    }

    if (bn & 1) {
      // Last, single-column panel; nothing follows it, so bb, C and off
      // need no further update.
      trmm_panel(1, bm, bk, alpha, ba, bb, C, ldc, off);
    }

    return 0;
  }